library(pdftools);library(tidyverse);library(data.table)
## Warning: package 'pdftools' was built under R version 4.0.2
## Using poppler version 0.73.0
## -- Attaching packages -------------------------------------- tidyverse 1.3.0 --
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## -- Conflicts ----------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(splitstackshape);library(tidytext);library(topicmodels)
## Warning: package 'splitstackshape' was built under R version 4.0.2
## Warning: package 'tidytext' was built under R version 4.0.2
## Warning: package 'topicmodels' was built under R version 4.0.2
#the input looks something like this
Text <- readRDS("C:\\Few_pila\\MPR_BOT\\MPR_data.rds")
head(Text,3)
## # A tibble: 3 x 2
## date Text
## <chr> <chr>
## 1 2017_Q1.pdf Advanced economies continued to recover. This was particularly th~
## 2 2017_Q2.pdf The global economy was projected to continue expanding. The U.S. ~
## 3 2017_Q3.pdf The global economy was projected to continue expanding. Advanced ~
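(A note on provenance: the RDS was presumably built from the quarterly MPR PDFs with pdftools, which is loaded above but not shown. A minimal sketch, assuming the PDFs sit in a local folder with names like 2017_Q1.pdf:)
#sketch only: how MPR_data.rds could be assembled; the pdf folder path is hypothetical
files <- list.files('C:\\Few_pila\\MPR_BOT\\pdfs', pattern = '\\.pdf$', full.names = TRUE)
Text <- tibble(date = basename(files),
               Text = map_chr(files, ~ paste(pdf_text(.x), collapse = ' ')))
saveRDS(Text, 'C:\\Few_pila\\MPR_BOT\\MPR_data.rds')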
#split the big Text into sentences, using the end-of-sentence period (.) as the delimiter
#preprocess ***
#the most important step: splitting the text into sentences (ending with ".")
Text$Text <- gsub('([a-z])([a-z])(\\.)','\\1\\2$~',Text$Text)
Text <- cSplit(Text,'Text','$~','long')
Text <- Text %>% ungroup()
Text$Text <- as.character(Text$Text)
Text$Text[1:5]
## [1] "Advanced economies continued to recover$"
## [2] "This was particularly the case for the U.S. economy where labor market conditions strengthened with investment poised to increase$"
## [3] "The euro area economies gained traction through domestic consumption, while the Japanese economy continued to recover at a gradual pace$"
## [4] "The Chinese economy was expected to slow down as a result of the ongoing economic reforms; however, the economic recovery in other Asian economies was expected to continue, driven by exports, especially those of electronic goods which benefited from the technological upcycle and the uptrend of Internet of Things (IoT) products$"
## [5] "Overall, trading partner growth was revised upward$"
#clean_stopword func
preprocess <- function(text_data){
  temp_text = c()
  for (i in text_data){
    i1 = str_split(i,' ') %>% unlist()
    #del stopwords
    i1 = i1[!(tolower(i1) %in% stop_words$word)]
    #combine
    i1 = paste(i1,collapse = ' ')
    temp_text <- append(temp_text,i1)
  }
  return(temp_text)
}
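A quick sanity check on a toy sentence (the exact output depends on tidytext's stop_words lexicons):
preprocess('the economy continued to recover')
#roughly "economy continued recover" once the stopwords are dropped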
#clean_text
Text$Text <- str_squish(Text$Text)                  #collapse repeated whitespace
Text$Text <- gsub("[0-9]+","",Text$Text)            #drop digits
Text$Text <- gsub("\\s*\\([^\\)]+\\)","",Text$Text) #drop parenthesized asides, e.g. "(IoT)"
Text$Text <- gsub("'s|´s|'|’s","",Text$Text)        #drop possessives and apostrophes
Text$Text <- gsub("[[:punct:]]+"," ",Text$Text)     #punctuation -> space
Text$Text <- gsub("\\s+"," ",Text$Text)             #collapse whitespace again
Text$Text <- preprocess(Text$Text)                  #remove stopwords
Text$Text <- tolower(Text$Text)
Text$Text <- textstem::lemmatize_strings(Text$Text) #lemmatize, e.g. "continued" -> "continue"
Text$Text[1:5]
## [1] "advance economy continue recover"
## [2] "economy labor market condition strengthen investment poise increase"
## [3] "euro economy gain traction domestic consumption japanese economy continue recover gradual pace"
## [4] "chinese economy expect slow result ongoing economic reform economic recovery asian economy expect continue drive export electronic benefit technological upcycle uptrend internet product"
## [5] "trade partner growth revise upward"
Text$doc_id <- 1:nrow(Text)
colnames(Text)
## [1] "date" "Text" "doc_id"
#cast to a document-term matrix (DTM) to feed into the topic model
#tokenize into bigrams and trigrams, counted per sentence-document
count_word <- Text %>% unnest_tokens(input = 'Text',output = 'Terms',
                                     token = 'ngrams' ,n = 3,n_min = 2) %>%
  count(Terms,doc_id)
#drop boilerplate report terms, terms containing digits, and empty tokens
count_word <- count_word[!(grepl('quarter|million|billion|trillion|due|previous|prior|percent',count_word$Terms)),]
count_word <- count_word[!(grepl('committee|monetary policy|thai|ahead',count_word$Terms)),]
count_word <- count_word[!(grepl('[0-9]',count_word$Terms)),]
count_word <- count_word[!(is.na(count_word$Terms)),]
count_word
## Terms doc_id n
## 1: ability government 905 1
## 2: ability government agency 905 1
## 3: ability household 822 1
## 4: ability household 885 1
## 5: ability household 951 1
## ---
## 23288: yield seek behavior 58 1
## 23289: yield snapbacks 444 1
## 23290: yield snapbacks affect 444 1
## 23291: yield tenure 86 1
## 23292: yield tenure decline 86 1
the_dtm <- cast_dtm(count_word,doc_id,Terms,n)
the_dtm
## <<DocumentTermMatrix (documents: 1074, terms: 14153)>>
## Non-/sparse entries: 23292/15177030
## Sparsity : 100%
## Maximal term length: 45
## Weighting : term frequency (tf)
In this step, experiment with the data: raise or lower k and tune alpha and delta (somewhere in the 0.01–2 range) together with k. Keep trial-and-erroring until you land on a model you like.
model = LDA(the_dtm,k = 12,
method = 'Gibbs',
control = list(alpha = 0.1,delta = 2,seed = 999))
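If you want something more systematic than eyeballing, one rough option is to refit over a few candidate k values and compare log-likelihoods; a minimal sketch (higher logLik is better, but inspecting the topics by eye still matters most):
#sketch: refit over a few k values and compare the Gibbs log-likelihood
ks <- c(8, 10, 12, 14)
fits <- lapply(ks, function(k) LDA(the_dtm, k = k, method = 'Gibbs',
                                   control = list(alpha = 0.1, delta = 2, seed = 999)))
data.frame(k = ks, logLik = sapply(fits, logLik))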
#a helper function for plotting the top terms per topic and experimenting
plot_topics <- function(k=8,alpha=0.1,delta=0.3){
  model = LDA(the_dtm,k = k,
              method = 'Gibbs',
              control = list(alpha = alpha,delta = delta,seed = 999))
  all_topics = tidy(model,matrix = 'beta')
  all_topics$beta <- all_topics$beta*1000
  all_top_terms <- all_topics %>% arrange(desc(beta)) %>%
    group_by(topic) %>% filter(row_number() <= 10)
  x = all_top_terms %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    scale_x_reordered() + ggdark::dark_theme_grey()
  return(x)
}
#plot_topics(k=12,alpha = 0.1,delta = 2) #great ***
(figure: top-10 terms per topic from plot_topics(k = 12, alpha = 0.1, delta = 2))
We split the data into two parts. The first Text table was used purely for fitting the model, with doc_id = a running number (each sentence treated as its own document). For the input, we instead map Text back to its original source document, e.g. 2017_Q1.
#prepare input
input = Text
count_word_input <- input %>% unnest_tokens(input = 'Text',output = 'Terms',
                                            token = 'ngrams' ,n = 3,n_min = 2) %>%
  count(Terms,date) #changed here: count by source document (date) instead of doc_id
count_word_input
## Terms date n
## 1: 4 continue 2018_Q2.pdf 1
## 2: 4 continue excess 2018_Q2.pdf 1
## 3: 4 government 2020_Q1.pdf 1
## 4: 4 government implement 2020_Q1.pdf 1
## 5: 4 oversupply 2018_Q1.pdf 1
## ---
## 24003: <NA> 2019_Q1.pdf 1
## 24004: <NA> 2019_Q2.pdf 1
## 24005: <NA> 2019_Q3.pdf 1
## 24006: <NA> 2019_Q4.pdf 1
## 24007: <NA> 2020_Q2.pdf 1
input_dtm <- cast_dtm(count_word_input,date,Terms,n)
model <- readRDS('C:\\Few_pila\\MPR_BOT\\lda_model.rds') #from my macbook
#predict with our model
test_lda <- posterior(model,input_dtm)
classifier <- function(test_lda){
  test_topic <- data.table(test_lda[['topics']] %>% as.matrix(),
                           'doc' = row.names(test_lda[['topics']] %>%
                                               as.matrix()))
  #sort documents chronologically: "2017_Q1.pdf" -> 20171
  test_topic$doc_num <- test_topic$doc %>% as.character() %>%
    gsub('_Q|\\.pdf',"",.) %>% as.numeric()
  test_topic <- test_topic %>% arrange(doc_num) %>% select(-doc_num)
  return(test_topic)
}
result <- classifier(test_lda = test_lda)
colnames(result) <- c(paste0('num',c(1:12)),'doc')
result
## num1 num2 num3 num4 num5 num6
## 1: 7.551729e-05 7.551729e-05 2.855309e-01 7.551729e-05 0.1073101 7.551729e-05
## 2: 1.288185e-03 6.134217e-05 5.588271e-02 6.134217e-05 0.4024660 6.134217e-05
## 3: 4.959825e-05 9.775816e-02 1.042059e-01 4.513441e-03 0.2053864 3.025494e-03
## 4: 6.034275e-05 2.227251e-01 8.514362e-02 6.034275e-05 0.2553102 6.637702e-04
## 5: 5.248793e-05 2.777136e-01 2.614424e-01 5.773672e-04 0.3113059 5.248793e-05
## 6: 1.591652e-01 1.313208e-02 1.395488e-01 5.448997e-05 0.5335113 5.448997e-05
## 7: 2.398412e-01 3.145627e-03 2.089006e-01 5.156766e-05 0.4724113 5.156766e-05
## 8: 1.071476e-01 6.692545e-05 6.692545e-05 6.692545e-05 0.7235310 6.692545e-05
## 9: 3.346806e-01 1.066790e-02 6.626027e-05 6.626027e-05 0.5897827 7.354890e-03
## 10: 1.806843e-01 1.057116e-01 6.815703e-05 6.815703e-05 0.5153353 3.482824e-02
## 11: 2.282007e-01 4.980937e-03 6.149305e-05 6.149305e-05 0.5036896 6.149305e-05
## 12: 1.267542e-01 1.462448e-01 6.178534e-02 6.496881e-05 0.4256107 6.496881e-05
## 13: 1.083772e-01 1.008640e-01 2.473704e-01 6.260957e-05 0.3124843 6.260957e-05
## 14: 1.560537e-01 5.324247e-05 6.128208e-02 5.324247e-05 0.4526142 1.118092e-03
## num7 num8 num9 num10 num11
## 1: 1.971001e-02 7.551729e-05 7.551729e-05 7.551729e-05 4.818758e-01
## 2: 6.134217e-05 6.134217e-05 6.134217e-05 6.134217e-05 4.257760e-01
## 3: 4.959825e-05 4.959825e-05 4.959825e-05 4.959825e-05 2.768079e-01
## 4: 6.034275e-05 6.034275e-05 6.034275e-05 6.034275e-05 1.032464e-01
## 5: 5.248793e-05 5.773672e-04 5.248793e-05 5.248793e-05 9.453076e-02
## 6: 4.413688e-03 5.448997e-05 5.448997e-05 5.448997e-05 1.499019e-01
## 7: 5.156766e-05 5.156766e-05 5.672442e-04 5.156766e-05 7.379332e-02
## 8: 6.692545e-05 6.692545e-05 6.692545e-05 6.692545e-05 5.695355e-02
## 9: 1.331831e-02 6.626027e-05 6.626027e-05 1.391466e-03 6.626027e-05
## 10: 6.815703e-05 6.815703e-05 6.815703e-05 6.815703e-05 1.301799e-02
## 11: 6.149305e-05 6.764236e-04 6.149305e-05 6.149305e-05 2.263559e-01
## 12: 6.496881e-05 6.496881e-05 6.496881e-05 6.496881e-05 7.023129e-02
## 13: 3.449787e-02 6.260957e-05 1.314801e-03 6.260957e-05 1.947784e-01
## 14: 2.359174e-01 1.118092e-03 5.324247e-05 5.324247e-05 9.163028e-02
## num12 doc
## 1: 1.050446e-01 2017_Q1.pdf
## 2: 1.141578e-01 2017_Q2.pdf
## 3: 3.080548e-01 2017_Q3.pdf
## 4: 3.325489e-01 2017_Q4.pdf
## 5: 5.359017e-02 2018_Q1.pdf
## 6: 5.448997e-05 2018_Q2.pdf
## 7: 1.082921e-03 2018_Q3.pdf
## 8: 1.118324e-01 2018_Q4.pdf
## 9: 4.247283e-02 2019_Q1.pdf
## 10: 1.500136e-01 2019_Q2.pdf
## 11: 3.572746e-02 2019_Q3.pdf
## 12: 1.689839e-01 2019_Q4.pdf
## 13: 6.260957e-05 2020_Q1.pdf
## 14: 5.324247e-05 2020_Q2.pdf
#group the topics under our own labels (no right or wrong here, since these are unsupervised clusters)
topic_combine <- data.table('doc' = result$doc,
'monetary_policy' = result$num1,
'macro&fiscal' = result$num2,
'macro' = result$num4+
result$num8+result$num10+result$num6,
'exchange_rate' = result$num9,
'price&inflation' =result$num11,
'financial worry' = result$num12,
'financial' = result$num3,
'Trade' = result$num5,
'Tourism' = result$num7)
topic_combine[,c(2:10)] <- topic_combine[,c(2:10)] * 100 #posterior shares sum to 1 per doc; scale to percent
topic_combine
## doc monetary_policy macro&fiscal macro exchange_rate
## 1: 2017_Q1.pdf 0.007551729 0.007551729 0.03020692 0.007551729
## 2: 2017_Q2.pdf 0.128818550 0.006134217 0.02453687 0.006134217
## 3: 2017_Q3.pdf 0.004959825 9.775815891 0.76381311 0.004959825
## 4: 2017_Q4.pdf 0.006034275 22.272507845 0.08447985 0.006034275
## 5: 2018_Q1.pdf 0.005248793 27.771362587 0.12597103 0.005248793
## 6: 2018_Q2.pdf 15.916521360 1.313208370 0.02179599 0.005448997
## 7: 2018_Q3.pdf 23.984117162 0.314562706 0.02062706 0.056724422
## 8: 2018_Q4.pdf 10.714763753 0.006692545 0.02677018 0.006692545
## 9: 2019_Q1.pdf 33.468062550 1.066790353 0.88788762 0.006626027
## 10: 2019_Q2.pdf 18.068429662 10.571155943 3.50327154 0.006815703
## 11: 2019_Q3.pdf 22.820071332 0.498093715 0.08609027 0.006149305
## 12: 2019_Q4.pdf 12.675415800 14.624480249 0.02598753 0.006496881
## 13: 2020_Q1.pdf 10.837716003 10.086401202 0.02504383 0.131480090
## 14: 2020_Q2.pdf 15.605366841 0.005324247 0.23426685 0.005324247
## price&inflation financial worry financial Trade Tourism
## 1: 48.187584957 10.504455520 28.553088657 10.73101 1.971001359
## 2: 42.577597841 11.415777205 5.588271378 40.24660 0.006134217
## 3: 27.680785636 30.805475647 10.420593195 20.53864 0.004959825
## 4: 10.324643978 33.254887762 8.514361574 25.53102 0.006034275
## 5: 9.453075793 5.359017426 26.144236826 31.13059 0.005248793
## 6: 14.990191805 0.005448997 13.954882302 53.35113 0.441368788
## 7: 7.379331683 0.108292079 20.890057756 47.24113 0.005156766
## 8: 5.695355374 11.183241869 0.006692545 72.35310 0.006692545
## 9: 0.006626027 4.247283329 0.006626027 58.97827 1.331831434
## 10: 1.301799346 15.001363141 0.006815703 51.53353 0.006815703
## 11: 22.635592178 3.572746280 0.006149305 50.36896 0.006149305
## 12: 7.023128898 16.898388773 6.178534304 42.56107 0.006496881
## 13: 19.477836213 0.006260957 24.737039820 31.24843 3.449787127
## 14: 9.163028431 0.005324247 6.128207859 45.26142 23.591736769
#extract the word immediately to the left and to the right of word_dum in every
#sentence that contains it; when a captured chunk is too long (> 20 chars), fall
#back through looser regexes (note: this reads the global `input`, so pass input$Text)
extract_side_words <- function(word_dum,input_text){
  num <- input[grepl(word_dum,input_text),]
  num$left <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*?) (.*)'),'\\2',num$Text)
  num$right <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*?) (.*)'),'\\3',num$Text)
  #left side
  for (i in 1:nrow(num)) {
    if(nchar(num$left[i]) > 20){
      word1 <- gsub(paste0('(.*?) ',word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
      word2 <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*)'),'\\2',num$Text[i])
      word3 <- gsub(paste0('(.*) (.*?) ',word_dum),'\\2',num$Text[i])
      score <- rbind(word1,word2,word3) %>% as.vector()
      score <- score %>% data.frame('text' = .) %>% mutate(len = nchar(text)) %>% filter(len < 20)
      if(nrow(score) == 0){
        word1 <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*?)'),'\\3',num$Text[i])
        word2 <- gsub(paste0('(.*?) ',word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
        word3 <- gsub(paste0(word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
        word4 <- gsub(paste0('(.*)',word_dum,' (.*)'),'\\2',num$Text[i])
        score <- rbind(word1,word2,word3,word4) %>% as.vector()
        score <- score %>% data.frame('text' = .) %>% mutate(len = nchar(text)) %>% filter(len < 20)
        if(nrow(score) != 0){
          num$left[i] <- score$text[1]
        } #else keep the long chunk as-is
      }else{
        num$left[i] <- score$text[1]
      }
    }
  }
  #right side
  for (i in 1:nrow(num)) {
    if(nchar(num$right[i]) > 20){
      word1 <- gsub(paste0('(.*?) ',word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
      word2 <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*)'),'\\2',num$Text[i])
      word3 <- gsub(paste0('(.*) (.*?) ',word_dum),'\\2',num$Text[i])
      score <- rbind(word1,word2,word3) %>% as.vector()
      score <- score %>% data.frame('text' = .) %>% mutate(len = nchar(text)) %>% filter(len < 20)
      if(nrow(score) == 0){
        word1 <- gsub(paste0('(.*) (.*?) ',word_dum,' (.*?)'),'\\3',num$Text[i])
        word2 <- gsub(paste0('(.*?) ',word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
        word3 <- gsub(paste0(word_dum,' (.*?) (.*)'),'\\1',num$Text[i])
        word4 <- gsub(paste0('(.*)',word_dum,' (.*)'),'\\2',num$Text[i])
        score <- rbind(word1,word2,word3,word4) %>% as.vector()
        score <- score %>% data.frame('text' = .) %>% mutate(len = nchar(text)) %>% filter(len < 20)
        if(nrow(score) != 0){
          num$right[i] <- score$text[1]
        } #else keep the long chunk as-is
      }else{
        num$right[i] <- score$text[1]
      }
    }
  }
  #check position: keep only candidates that actually occur right next to word_dum
  word <- append(num$left,num$right) %>% unique()
  left <- data.table()
  for (i in 1:length(word)) {
    left_word <- paste(word[i],word_dum)
    count_left = sum(grepl(left_word,num$Text))
    if(count_left != 0){
      data_left <- data.table('side_word' = word[i],'direction' = 'left_side',
                              'count' = count_left)
      left <- rbind(left,data_left)
    }
  }
  right <- data.table()
  for (i in 1:length(word)) {
    right_word <- paste(word_dum,word[i])
    count_right = sum(grepl(right_word,num$Text))
    if(count_right != 0){
      data_right <- data.table('side_word' = word[i],'direction' = 'right_side',
                               'count' = count_right)
      right <- rbind(right,data_right)
    }
  }
  combine <- rbind(left,right)
  combine$target <- word_dum
  combine <- combine %>% select(target,side_word,direction,count) %>% arrange(desc(count))
  return(combine)
}
#the idea: take the word to the left or right of each keyword extracted from the LDA and use it for sentiment, e.g. "policy rate cut" >> a cut in the policy rate, and in which direction it points
#example
extract_side_words('policy rate',input$Text)
## target side_word direction count
## 1: policy rate percent right_side 25
## 2: policy rate maintain left_side 16
## 3: policy rate raise left_side 13
## 4: policy rate cut left_side 13
## 5: policy rate percentage right_side 11
## 6: policy rate cut right_side 8
## 7: policy rate real left_side 5
## 8: policy rate hold right_side 5
## 9: policy rate increase right_side 4
## 10: policy rate vote left_side 3
## 11: policy rate remain left_side 3
## 12: policy rate assessment left_side 3
## 13: policy rate increase left_side 3
## 14: policy rate reduce left_side 3
## 15: policy rate include left_side 3
## 16: policy rate line left_side 3
## 17: policy rate remain right_side 3
## 18: policy rate support right_side 3
## 19: policy rate unchanged right_side 3
## 20: policy rate due right_side 3
## 21: policy rate microprudential right_side 3
## 22: policy rate bank left_side 2
## 23: policy rate view left_side 2
## 24: policy rate expect left_side 2
## 25: policy rate lower left_side 2
## 26: policy rate low left_side 2
## 27: policy rate curb right_side 2
## 28: policy rate half right_side 2
## 29: policy rate close left_side 1
## 30: policy rate current left_side 1
## 31: policy rate unanimously left_side 1
## 32: policy rate rise left_side 1
## 33: policy rate ahead left_side 1
## 34: policy rate accommodative left_side 1
## 35: policy rate currency left_side 1
## 36: policy rate international left_side 1
## 37: policy rate negative left_side 1
## 38: policy rate purchase left_side 1
## 39: policy rate hold left_side 1
## 40: policy rate bank right_side 1
## 41: policy rate raise right_side 1
## 42: policy rate low right_side 1
## 43: policy rate federal right_side 1
## 44: policy rate march right_side 1
## 45: policy rate government right_side 1
## 46: policy rate stabilize right_side 1
## 47: policy rate weak right_side 1
## 48: policy rate 4 right_side 1
## 49: policy rate previous right_side 1
## 50: policy rate continuously right_side 1
## 51: policy rate additional right_side 1
## 52: policy rate majority right_side 1
## 53: policy rate september right_side 1
## 54: policy rate october right_side 1
## 55: policy rate decline right_side 1
## 56: policy rate financial right_side 1
## 57: policy rate range right_side 1
## 58: policy rate announce right_side 1
## 59: policy rate regional right_side 1
## 60: policy rate compute right_side 1
## 61: policy rate reduction right_side 1
## target side_word direction count
#select our main word from each topic
all_topics = tidy(model,matrix = 'beta')
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
all_topics$beta <- all_topics$beta*1000
all_top_terms <- all_topics %>% arrange(desc(beta)) %>%
group_by(topic) %>% filter(row_number() <= 100)
#keep the top 100 terms per topic
filtered_word <- all_top_terms %>% group_by(topic) %>% filter(beta > median(beta))
#then keep only terms with beta above the topic median
#map to topic_name
#expects a single topic id (call it inside a grouped mutate); a combined id like
#'4,6,8,10' resolves via its first number
group_topic <- function(topic_num){
  topic_num <- as.character(topic_num)
  topic_num <- str_split(topic_num,pattern = ',') %>% unlist()
  if(length(topic_num) > 1){
    topic_num = topic_num[1]
  }
  if(topic_num == '1'){
    value = 'monetary_policy'
  }else if(topic_num == '2'){
    value = 'macro&fiscal'
  }else if(topic_num == '4'|topic_num == '6'|
           topic_num == '8'|topic_num == '10'){
    value = 'macro'
  }else if(topic_num == '3'){
    value = 'financial'
  }else if(topic_num == '9'){
    value = 'exchange_rate'
  }else if(topic_num == '5'){
    value = 'Trade'
  }else if(topic_num == '7'){
    value = 'Tourism'
  }else if(topic_num == '11'){
    value = 'price&inflation'
  }else if(topic_num == '12'){
    value = 'financial worry'
  }
  return(value)
}
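A quick check of the mapping (the combined macro topics resolve through their first number):
group_topic('4,6,8,10') #"macro"
group_topic(12)         #"financial worry"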
filtered_word <- filtered_word %>% mutate(topic_name = group_topic(topic))
filtered_word
## # A tibble: 435 x 4
## # Groups: topic [12]
## topic term beta topic_name
## <int> <chr> <dbl> <chr>
## 1 1 policy rate 2.45 monetary_policy
## 2 12 financial stability 1.92 financial worry
## 3 11 headline inflation 1.71 price&inflation
## 4 5 trade partner 1.71 Trade
## 5 1 central bank 1.50 monetary_policy
## 6 3 bond yield 1.39 financial
## 7 11 oil price 1.32 price&inflation
## 8 5 merchandise export 1.22 Trade
## 9 2 private consumption 1.19 macro&fiscal
## 10 11 core inflation 1.19 price&inflation
## # ... with 425 more rows
topic_name1 <- filtered_word %>% filter(topic_name == 'monetary_policy')
topic_side_word <- data.table()
for (i in topic_name1$term) {
  temp <- extract_side_words(i,input$Text)
  topic_side_word <- rbind(topic_side_word,temp)
}
topic_side_word
## target side_word direction count
## 1: policy rate percent right_side 25
## 2: policy rate maintain left_side 16
## 3: policy rate raise left_side 13
## 4: policy rate cut left_side 13
## 5: policy rate percentage right_side 11
## ---
## 474: regional central rate left_side 1
## 475: regional central bank include right_side 2
## 476: regional central bank ease right_side 2
## 477: regional central bank hand left_side 1
## 478: regional central bank rate left_side 1
#after collecting the full set of sentiment words (hand-labeled lexicons)
sentiment_bot <- read.csv('C:\\Few_pila\\MPR_BOT\\sentiment_bot-Sheet1.csv')
sentiment_bot_special <-read.csv('C:\\Few_pila\\MPR_BOT\\sentiment_bot_special-Sheet1.csv')
sentiment_bot
## topic terms direction
## 1 1 increase up
## 2 1 raise up
## 3 1 rise up
## 4 1 facilitate up
## 5 1 improve up
## 6 1 expand up
## 7 1 implement up
## 8 1 improvement up
## 9 1 outperform up
## 10 1 acommodative up
## 11 1 increase up
## 12 1 raise up
## 13 1 improve up
## 14 1 relaxation up
## 15 1 maintain neutral
## 16 1 remain neutral
## 17 1 hold neutral
## 18 1 cut down
## 19 1 volatile down
## 20 1 weaken down
## 21 1 dovish down
## 22 1 reduce down
## 23 1 weak down
## 24 1 reduction down
## 25 1 acommodative down
## 26 2 acommodative up
## 27 2 increase up
## 28 2 raise up
## 29 2 improve up
## 30 2 relaxation up
## 31 2 remain neutral
## 32 2 maintain neutral
## 33 2 hold neutral
## 34 2 stay neutral
## 35 2 decrease down
## 36 2 fall down
## 37 2 negative down
## 38 2 vulnerability down
## 39 2 negative down
## 40 2 cut down
## 41 2 decline down
## 42 7 expand up
## 43 7 rise up
## 44 7 improve up
## 45 7 increase up
## 46 7 lead up
## 47 7 relaxation up
## 48 7 improvement up
## 49 7 expansion up
## 50 7 slowdown down
## 51 7 covid down
## 52 7 disruption down
## 53 7 short down
## 54 7 decreasedrop down
## 55 7 fragile down
## 56 7 pandemic down
## 57 7 outbreak down
## 58 7 decelerate down
## 59 5 continue up
## 60 5 improvement up
## 61 5 outperform up
## 62 5 sustain up
## 63 5 strength up
## 64 5 stimulate up
## 65 5 stimulus up
## 66 5 remain neutral
## 67 5 maintain neutral
## 68 5 slowdown down
## 69 5 intensify down
## 70 5 slowdown down
## 71 5 tension down
## 72 5 contraction down
## 73 5 pandemic down
## 74 5 protectionist down
## 75 5 short down
## 76 5 downard down
## 77 5 decline down
## 78 9 strengthen up
## 79 9 remain neutral
## 80 9 depreciate down
## 81 3 accomodative up
## 82 3 increase up
## 83 3 raise up
## 84 3 rise up
## 85 3 addition up
## 86 3 improve up
## 87 3 remain neutral
## 88 3 hold neutral
## 89 3 unchange neutral
## 90 3 stay neutral
## 91 3 cut down
## 92 3 decline down
## 93 3 fall down
## 94 3 tighten down
## 95 3 reduction down
## 96 11 increase up
## 97 11 rise up
## 98 11 pull up
## 99 11 improve up
## 100 11 upward up
## 101 11 remain neutral
## 102 11 hold neutral
## 103 11 unchanged neutral
## 104 11 low down
## 105 11 decline down
## 106 11 downward down
## 107 11 negative down
## 108 11 fall down
## 109 11 downside down
## 110 11 decrease down
## 111 11 decelerate down
## 112 11 reduction down
## 113 4,6,8,10 continue up
## 114 4,6,8,10 expansion up
## 115 4,6,8,10 accelerate up
## 116 4,6,8,10 achieve up
## 117 4,6,8,10 gain up
## 118 4,6,8,10 extension up
## 119 4,6,8,10 continuous up
## 120 4,6,8,10 increase up
## 121 4,6,8,10 remain neutral
## 122 4,6,8,10 concern down
## 123 4,6,8,10 deterioration down
## 124 4,6,8,10 disruption down
## 125 4,6,8,10 tighten down
## 126 4,6,8,10 facilitate down
## 127 4,6,8,10 weak down
## 128 4,6,8,10 decrease down
## 129 4,6,8,10 contraction down
For cases where we already know the full phrase, we can put it straight into the lexicon.
sentiment_bot_special
## topic special_word direction
## 1 1 accomodative monetary up
## 2 2 continue expand up
## 3 2 trade protectionism down
## 4 12 concern financial stability down
## 5 12 risk financial stability down
## 6 12 facilitate economic growth down
## 7 12 pose vulnerabillity financial down
## 8 12 build vulnerabillity financial down
## 9 12 buildup vulnerabillity financial down
## 10 12 consistent economic growth neutral
## 11 12 remain inflation target neutral
## 12 12 remain financial stability up
## 13 12 improve financial stability up
## 14 12 ensure financial stability up
## 15 12 continutation economic growth up
## 16 12 continue economic growth up
## 17 12 rise economic growth up
## 18 12 continuous economic growth up
## 19 12 expansion economic growth up
## 20 12 gain economic growth up
## 21 12 remain pocket risk up
## 22 12 rise inflation target up
## 23 12 increase inflation target up
sentiment_bot <- sentiment_bot %>% group_by(topic) %>% mutate(topic_name = group_topic(topic))
sentiment_bot_special <- sentiment_bot_special %>% group_by(topic) %>% mutate(topic_name = group_topic(topic))
sentiment_bot
## # A tibble: 129 x 4
## # Groups: topic [8]
## topic terms direction topic_name
## <chr> <chr> <chr> <chr>
## 1 1 increase up monetary_policy
## 2 1 raise up monetary_policy
## 3 1 rise up monetary_policy
## 4 1 facilitate up monetary_policy
## 5 1 improve up monetary_policy
## 6 1 expand up monetary_policy
## 7 1 implement up monetary_policy
## 8 1 improvement up monetary_policy
## 9 1 outperform up monetary_policy
## 10 1 acommodative up monetary_policy
## # ... with 119 more rows
#take the extracted keyword + sentiment pairs and search for them in the input (our documents)
topic_name_x <- unique(filtered_word$topic_name)[1]
topic_word <- filtered_word %>% filter(topic_name == topic_name_x)
side_word <- data.table()
for (i in topic_word$term) {
  temp <- extract_side_words(i,input$Text)
  side_word <- rbind(side_word,temp)
}
direction_word <- sentiment_bot %>% filter(topic_name == topic_name_x)
colnames(direction_word)[2] <- 'side_word'
colnames(direction_word)[3] <- 'sentiment'
combine = direction_word %>% inner_join(side_word,by = 'side_word') %>%
  mutate(full_word = ifelse(direction == 'right_side',
                            paste(target,side_word),
                            paste(side_word,target))) %>% ungroup()
combine
## # A tibble: 81 x 8
## topic side_word sentiment topic_name target direction count full_word
## <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 1 increase up monetary_po~ policy ~ right_si~ 4 policy rate ~
## 2 1 increase up monetary_po~ policy ~ left_side 3 increase pol~
## 3 1 increase up monetary_po~ inflati~ left_side 1 increase inf~
## 4 1 increase up monetary_po~ bank re~ right_si~ 1 bank region ~
## 5 1 increase up monetary_po~ build p~ left_side 2 increase bui~
## 6 1 increase up monetary_po~ central~ right_si~ 1 central bank~
## 7 1 increase up monetary_po~ domesti~ right_si~ 1 domestic dem~
## 8 1 raise up monetary_po~ policy ~ left_side 13 raise policy~
## 9 1 raise up monetary_po~ policy ~ right_si~ 1 policy rate ~
## 10 1 raise up monetary_po~ central~ right_si~ 3 central bank~
## # ... with 71 more rows
sentiment_word <- combine %>% select(full_word,sentiment,topic_name)
sentiment_word
## # A tibble: 81 x 3
## full_word sentiment topic_name
## <chr> <chr> <chr>
## 1 policy rate increase up monetary_policy
## 2 increase policy rate up monetary_policy
## 3 increase inflation target up monetary_policy
## 4 bank region increase up monetary_policy
## 5 increase build policy up monetary_policy
## 6 central bank region increase up monetary_policy
## 7 domestic demand increase up monetary_policy
## 8 raise policy rate up monetary_policy
## 9 policy rate raise up monetary_policy
## 10 central bank raise up monetary_policy
## # ... with 71 more rows
#by date
sentiment_count <- data.table()
for (i in unique(input$date)) {
  input_data <- input %>% filter(date == i)
  temp <- sentiment_word %>% group_by(full_word) %>%
    mutate(count = sum(grepl(full_word,input_data$Text)))
  temp$doc <- i
  sentiment_count <- rbind(sentiment_count,temp)
}
sentiment_count
## full_word sentiment topic_name count doc
## 1: policy rate increase up monetary_policy 0 2017_Q1.pdf
## 2: increase policy rate up monetary_policy 0 2017_Q1.pdf
## 3: increase inflation target up monetary_policy 0 2017_Q1.pdf
## 4: bank region increase up monetary_policy 0 2017_Q1.pdf
## 5: increase build policy up monetary_policy 0 2017_Q1.pdf
## ---
## 1130: policy stance dovish down monetary_policy 0 2020_Q2.pdf
## 1131: central bank maintain dovish down monetary_policy 0 2020_Q2.pdf
## 1132: reduce policy rate down monetary_policy 0 2020_Q2.pdf
## 1133: policy rate weak down monetary_policy 0 2020_Q2.pdf
## 1134: policy rate reduction down monetary_policy 1 2020_Q2.pdf
sentiment_only <- sentiment_count %>% group_by(sentiment,doc) %>%
summarise(count = sum(count)) %>% ungroup()
## `summarise()` regrouping output by 'sentiment' (override with `.groups` argument)
sentiment_only
## # A tibble: 42 x 3
## sentiment doc count
## <chr> <chr> <int>
## 1 down 2017_Q1.pdf 1
## 2 down 2017_Q2.pdf 1
## 3 down 2017_Q3.pdf 1
## 4 down 2017_Q4.pdf 0
## 5 down 2018_Q1.pdf 0
## 6 down 2018_Q2.pdf 1
## 7 down 2018_Q3.pdf 2
## 8 down 2018_Q4.pdf 1
## 9 down 2019_Q1.pdf 0
## 10 down 2019_Q2.pdf 6
## # ... with 32 more rows
#note: sum() runs over all quarters, so each quarter's net (up - down) count is
#scaled by the grand total of mentions across the whole sample
sentiment_ratio <- sentiment_only %>% spread(sentiment,count) %>%
  mutate(sentiment_ratio = (up-down)/(sum(up+down+neutral)) * 100)
sentiment_ratio #for one topic
## # A tibble: 14 x 5
## doc down neutral up sentiment_ratio
## <chr> <int> <int> <int> <dbl>
## 1 2017_Q1.pdf 1 1 0 -0.483
## 2 2017_Q2.pdf 1 3 3 0.966
## 3 2017_Q3.pdf 1 4 6 2.42
## 4 2017_Q4.pdf 0 4 9 4.35
## 5 2018_Q1.pdf 0 4 9 4.35
## 6 2018_Q2.pdf 1 5 22 10.1
## 7 2018_Q3.pdf 2 9 23 10.1
## 8 2018_Q4.pdf 1 3 10 4.35
## 9 2019_Q1.pdf 0 10 5 2.42
## 10 2019_Q2.pdf 6 7 2 -1.93
## 11 2019_Q3.pdf 12 3 3 -4.35
## 12 2019_Q4.pdf 9 6 1 -3.86
## 13 2020_Q1.pdf 8 5 1 -3.38
## 14 2020_Q2.pdf 2 4 1 -0.483
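A quick arithmetic check against the table: the down/neutral/up columns total 44, 68, and 95 (207 overall), so 2017_Q4 gets (9 - 0) / 207 * 100 ≈ 4.35, matching the printed value.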
Now wrap it all up: run the same pipeline for every topic in one loop.
sentiment_data <- data.table()
for (name in unique(filtered_word$topic_name)) {
  topic_name_x = name
  topic_word <- filtered_word %>% filter(topic_name == topic_name_x)
  side_word <- data.table()
  for (i in topic_word$term) {
    temp <- extract_side_words(i,input$Text)
    side_word <- rbind(side_word,temp)
  }
  direction_word <- sentiment_bot %>% filter(topic_name == topic_name_x)
  colnames(direction_word)[2] <- 'side_word'
  colnames(direction_word)[3] <- 'sentiment'
  combine = direction_word %>% inner_join(side_word,by = 'side_word') %>%
    mutate(full_word = ifelse(direction == 'right_side',paste(target,side_word),
                              paste(side_word,target))) %>% ungroup()
  sentiment_word <- combine %>% select(full_word,sentiment,topic_name)
  #combine with special-case phrases, if the topic has any
  special_word <- sentiment_bot_special %>% filter(topic_name == topic_name_x)
  if(nrow(special_word) != 0){
    special_word <- special_word %>% ungroup() %>% select(special_word,direction,topic_name)
    colnames(special_word) <- c('full_word','sentiment','topic_name')
    sentiment_word <- rbind(sentiment_word,special_word)
  }
  #by date
  sentiment_count <- data.table()
  for (i in unique(input$date)) {
    input_data <- input %>% filter(date == i)
    temp <- sentiment_word %>% group_by(full_word) %>%
      mutate(count = sum(grepl(full_word,input_data$Text)))
    temp$doc <- i
    sentiment_count <- rbind(sentiment_count,temp)
  }
  #combine count of sentiment
  sentiment_only <- sentiment_count %>% group_by(sentiment,doc) %>%
    summarise(count = sum(count)) %>% ungroup()
  if(any(grepl('neutral',sentiment_only$sentiment))){
    #sentiment ratio = (up - down) / (up + down + neutral)
    sentiment_ratio <- sentiment_only %>% spread(sentiment,count) %>%
      mutate(sentiment_ratio = (up-down)/(sum(up+down+neutral)) * 100)
  }else{
    sentiment_ratio <- sentiment_only %>% spread(sentiment,count) %>%
      mutate(sentiment_ratio = (up-down)/(sum(up+down)) * 100)
    sentiment_ratio$neutral = 0
    sentiment_ratio <- sentiment_ratio %>% select(doc,down,neutral,up,
                                                  sentiment_ratio)
  }
  sentiment_ratio$topic_name <- paste0('sentiment_',topic_name_x)
  #BIND
  sentiment_data <- rbind(sentiment_data,sentiment_ratio)
}
sentiment_data
## doc down neutral up sentiment_ratio topic_name
## 1: 2017_Q1.pdf 1 0 0 -0.7194245 sentiment_monetary_policy
## 2: 2017_Q2.pdf 1 0 3 1.4388489 sentiment_monetary_policy
## 3: 2017_Q3.pdf 1 0 6 3.5971223 sentiment_monetary_policy
## 4: 2017_Q4.pdf 0 0 9 6.4748201 sentiment_monetary_policy
## 5: 2018_Q1.pdf 0 0 9 6.4748201 sentiment_monetary_policy
## ---
## 122: 2019_Q2.pdf 5 0 0 -7.3529412 sentiment_Tourism
## 123: 2019_Q3.pdf 3 0 0 -4.4117647 sentiment_Tourism
## 124: 2019_Q4.pdf 0 0 0 0.0000000 sentiment_Tourism
## 125: 2020_Q1.pdf 11 0 1 -14.7058824 sentiment_Tourism
## 126: 2020_Q2.pdf 14 0 5 -13.2352941 sentiment_Tourism
#combine them together
col_word = colnames(topic_combine)[c(2:10)]
topic_sentiment <- data.table()
for (word in col_word) {
  key_word <- paste0('sentiment_',word)
  one_topic = sentiment_data %>% filter(topic_name == key_word) %>% select(sentiment_ratio)
  colnames(one_topic) = key_word
  topic_sentiment <- cbind(topic_sentiment,one_topic)
}
#final result
topic_modeling <- data.table(topic_combine,topic_sentiment)
topic_modeling <- topic_modeling %>% select(doc,monetary_policy,sentiment_monetary_policy,
`macro&fiscal`,`sentiment_macro&fiscal`,
macro,sentiment_macro,
exchange_rate,sentiment_exchange_rate,
`price&inflation`,`sentiment_price&inflation`,
`financial worry`,`sentiment_financial worry`,
financial,sentiment_financial,
Trade,sentiment_Trade,
Tourism,sentiment_Tourism)
topic_modeling
## doc monetary_policy sentiment_monetary_policy macro&fiscal
## 1: 2017_Q1.pdf 0.007551729 -0.7194245 0.007551729
## 2: 2017_Q2.pdf 0.128818550 1.4388489 0.006134217
## 3: 2017_Q3.pdf 0.004959825 3.5971223 9.775815891
## 4: 2017_Q4.pdf 0.006034275 6.4748201 22.272507845
## 5: 2018_Q1.pdf 0.005248793 6.4748201 27.771362587
## 6: 2018_Q2.pdf 15.916521360 15.1079137 1.313208370
## 7: 2018_Q3.pdf 23.984117162 15.1079137 0.314562706
## 8: 2018_Q4.pdf 10.714763753 6.4748201 0.006692545
## 9: 2019_Q1.pdf 33.468062550 3.5971223 1.066790353
## 10: 2019_Q2.pdf 18.068429662 -2.8776978 10.571155943
## 11: 2019_Q3.pdf 22.820071332 -6.4748201 0.498093715
## 12: 2019_Q4.pdf 12.675415800 -5.7553957 14.624480249
## 13: 2020_Q1.pdf 10.837716003 -5.0359712 10.086401202
## 14: 2020_Q2.pdf 15.605366841 -0.7194245 0.005324247
## sentiment_macro&fiscal macro sentiment_macro exchange_rate
## 1: 3.846154 0.03020692 0.000000 0.007551729
## 2: 8.974359 0.02453687 7.608696 0.006134217
## 3: 8.974359 0.76381311 6.521739 0.004959825
## 4: 11.538462 0.08447985 6.521739 0.006034275
## 5: 16.666667 0.12597103 10.869565 0.005248793
## 6: 6.410256 0.02179599 3.260870 0.005448997
## 7: -6.410256 0.02062706 4.347826 0.056724422
## 8: -10.256410 0.02677018 -1.086957 0.006692545
## 9: 3.846154 0.88788762 3.260870 0.006626027
## 10: 2.564103 3.50327154 -4.347826 0.006815703
## 11: 1.282051 0.08609027 1.086957 0.006149305
## 12: 1.282051 0.02598753 1.086957 0.006496881
## 13: 0.000000 0.02504383 -1.086957 0.131480090
## 14: -2.564103 0.23426685 -3.260870 0.005324247
## sentiment_exchange_rate price&inflation sentiment_price&inflation
## 1: 0.00000 48.187584957 2.0270270
## 2: 0.00000 42.577597841 -2.0270270
## 3: 0.00000 27.680785636 -5.4054054
## 4: 0.00000 10.324643978 2.0270270
## 5: 0.00000 9.453075793 0.6756757
## 6: 33.33333 14.990191805 5.4054054
## 7: 0.00000 7.379331683 2.7027027
## 8: -33.33333 5.695355374 -2.7027027
## 9: 0.00000 0.006626027 -2.0270270
## 10: 0.00000 1.301799346 0.0000000
## 11: 0.00000 22.635592178 4.0540541
## 12: 0.00000 7.023128898 1.3513514
## 13: -33.33333 19.477836213 -4.0540541
## 14: 0.00000 9.163028431 -2.0270270
## financial worry sentiment_financial worry financial sentiment_financial
## 1: 10.504455520 3.448276 28.553088657 6.8376068
## 2: 11.415777205 10.344828 5.588271378 -2.5641026
## 3: 30.805475647 13.793103 10.420593195 -7.6923077
## 4: 33.254887762 -6.896552 8.514361574 4.2735043
## 5: 5.359017426 -3.448276 26.144236826 5.1282051
## 6: 0.005448997 3.448276 13.954882302 9.4017094
## 7: 0.108292079 10.344828 20.890057756 11.1111111
## 8: 11.183241869 3.448276 0.006692545 4.2735043
## 9: 4.247283329 -3.448276 0.006626027 3.4188034
## 10: 15.001363141 0.000000 0.006815703 -0.8547009
## 11: 3.572746280 3.448276 0.006149305 -4.2735043
## 12: 16.898388773 10.344828 6.178534304 -5.1282051
## 13: 0.006260957 0.000000 24.737039820 -5.9829060
## 14: 0.005324247 0.000000 6.128207859 -5.1282051
## Trade sentiment_Trade Tourism sentiment_Tourism
## 1: 10.73101 1.9108280 1.971001359 0.000000
## 2: 40.24660 3.1847134 0.006134217 1.470588
## 3: 20.53864 6.3694268 0.004959825 2.941176
## 4: 25.53102 3.1847134 0.006034275 8.823529
## 5: 31.13059 3.8216561 0.005248793 4.411765
## 6: 53.35113 3.8216561 0.441368788 1.470588
## 7: 47.24113 1.2738854 0.005156766 2.941176
## 8: 72.35310 -3.1847134 0.006692545 1.470588
## 9: 58.97827 -4.4585987 1.331831434 1.470588
## 10: 51.53353 -9.5541401 0.006815703 -7.352941
## 11: 50.36896 -7.6433121 0.006149305 -4.411765
## 12: 42.56107 -3.1847134 0.006496881 0.000000
## 13: 31.24843 -0.6369427 3.449787127 -14.705882
## 14: 45.26142 -1.9108280 23.591736769 -13.235294
#topic_modeling %>% select()
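To eyeball the final table, here is a quick plot sketch of one topic's share against its sentiment across quarters (ggplot2 and tidyr are already loaded):
#sketch: topic share vs. sentiment per quarter for the monetary_policy topic
topic_modeling %>%
  mutate(doc = gsub('\\.pdf','',doc)) %>%
  select(doc, monetary_policy, sentiment_monetary_policy) %>%
  pivot_longer(-doc) %>%
  ggplot(aes(doc, value, group = name, color = name)) +
  geom_line() + geom_point() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))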