資料來源

Inchape shipping service :
https://www.iss-shipping.com/news-and-media
時間:
2018-05-30~2020-03-19(共323篇)

匯入 package

require("tm")
library(qdap)
library(tidyr)
library(dplyr)
require(readr)
require(lubridate)
require(stringr)
library(textstem)
library(tidytext)
library(topicmodels)
library(tidyverse)

匯入資料

original_data<-read_csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/allnewdata.csv" ,col_names = T,
              ,col_types = cols( date = col_character(), title =col_character(),body =col_character())
              ,locale = locale(encoding = "Big-5"))

前處理

original_data$news_id=c(1:nrow(original_data))   #標註新聞ID
original_data$date<-ymd(original_data$date)      #時間轉格式
original_data$date=original_data$date+1          #時間+1天
news_range<-range(original_data$date)            #"2018-05-31" "2020-03-20"#
original_data$body<-str_replace_all(original_data$body,"[[:punct:]]","") #刪除標點

斷詞

var_stopwords<-stopwords()
tmp=original_data %>%unnest_tokens(word, body) %>% #斷詞
  filter(!grepl("[^a-zA-Z]",word))%>%     
  mutate(word=lemmatize_strings(word))%>% #還原詞幹
  group_by(word,news_id)%>%            
  summarise(count=n()) 
## Registered S3 methods overwritten by 'textclean':
##   method           from
##   print.check_text qdap
##   print.sub_holder qdap
id=which(tmp$word%in%var_stopwords)       #去除停用字
tfm=tmp[-id,]
tfm[100:105,]
## # A tibble: 6 x 3
## # Groups:   word [4]
##   word          news_id count
##   <chr>           <int> <int>
## 1 aaprnwoodside      98     1
## 2 aaron              66     1
## 3 aaron              76     1
## 4 abaco              79     2
## 5 abaco              82     2
## 6 abadi              95     4

資料格式轉DTM

dtm<-cast_dtm(tfm,news_id,word,count)
dtm
## <<DocumentTermMatrix (documents: 323, terms: 9903)>>
## Non-/sparse entries: 66833/3131836
## Sparsity           : 98%
## Maximal term length: NA
## Weighting          : term frequency (tf)
news_time=original_data$date

以LDA分群

以AIC判定分群數量

best.model <- lapply(seq(2,22, by=4), function(k){LDA(dtm, k,control = list(seed = 1234))})
beepr::beep(8)
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, AIC)))
best.model.logLik.df <- data.frame(topics=seq(2,22, by=4), LL=as.numeric(as.matrix(best.model.logLik)))
best.model.logLik.df=best.model.logLik.df[1:6,]
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) + 
  xlab("Number of topics") + ylab("AIC of the model") + 
  geom_line() + 
  theme_bw() 

決定分成6群

進行分群

m_lda <- LDA(dtm, k = 6, control = list(seed = 1234))# k means class

以beta判斷主題

m_lda
## A LDA_VEM topic model with 6 topics.
ap_topics <- tidy(m_lda, matrix = "beta")#每一個主題的用詞機率
options(repr.plot.width=14, repr.plot.height=10)#輸出大小
ap_top_terms <- ap_topics %>%
  group_by(topic) %>% #分組 
  top_n(10, beta) %>% #出現機率 
  ungroup() %>%
  arrange(topic, -beta)


ap_top_terms$topic=ifelse(ap_top_terms$topic==1,"1.船隻原料",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==2,"2.馬頭1",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==3,"3.客戶關係",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==4,"4.馬頭2",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==5,"5.天氣",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==6,"6.颱風",ap_top_terms$topic)
ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") + #一分二
  coord_flip() + #翻轉座標
  scale_x_reordered() #去掉字尾

計算gamma

將個文章判定為機率最大的主題

ap_documents <- tidy(m_lda, matrix = "gamma")
m_gamma<-pivot_wider(ap_documents,names_from = topic, values_from = gamma)
m_gamma$document=as.numeric(m_gamma$document)
document_topic=data.frame(
  document=m_gamma$document
)
for(i in 1:nrow(document_topic)){
  document_topic$topic[i]=which.max(m_gamma[i,2:ncol(m_gamma)])
}
news_time=data.frame(date=original_data$date,
                     document=original_data$news_id)
document_topic<-inner_join(document_topic,news_time,by=c("document"="document"))

與網路標籤比較

tag <- read.csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/inchcape_0428news.csv")
document_topic$tage=c()
id=document_topic$document[1]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage=tag$tag_name[which(tag$title==title)]

for(i in c(c(1:180),c(182:nrow(document_topic)))){ #181沒有標籤要跳過
id=document_topic$document[i]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage[i]=tag$tag_name[which(tag$title==title)]
}
(document_topic)[46:50,]
##    document topic       date                                      tage
## 46       74     4 2019-09-21                             Newsletters  
## 47       84     2 2019-08-31     Port Advisories, Weather Advisories  
## 48       91     2 2019-08-03                             Newsletters  
## 49      113     2 2019-06-22                             Newsletters  
## 50      132     6 2019-05-24                         Port Advisories
document_topic2=(document_topic)

分割網路標籤

document_topic2$tage=as.character(document_topic2$tage)
document_topic2$tage<-str_replace_all(document_topic2$tage," ","")
document_topic2$tage1=NA
document_topic2$tage2=NA
document_topic2$tage3=NA
i=1
for(i in 1:nrow(document_topic)){
tmp=strsplit(document_topic2$tage[i],split=",")
L=length(tmp[[1]])
if(L>0){
  for(j in 1:L){
    document_topic2[i,j+4]=tmp[[1]][j]
  }
}
}
tage_name=names(table(c(names(table(document_topic2$tage1)),names(table(document_topic2$tage2)),names(table(document_topic2$tage3)))))
(document_topic2)[46:50,]
##    document topic       date                             tage
## 46       74     4 2019-09-21                      Newsletters
## 47       84     2 2019-08-31 PortAdvisories,WeatherAdvisories
## 48       91     2 2019-08-03                      Newsletters
## 49      113     2 2019-06-22                      Newsletters
## 50      132     6 2019-05-24                   PortAdvisories
##             tage1             tage2 tage3
## 46    Newsletters              <NA>  <NA>
## 47 PortAdvisories WeatherAdvisories  <NA>
## 48    Newsletters              <NA>  <NA>
## 49    Newsletters              <NA>  <NA>
## 50 PortAdvisories              <NA>  <NA>

LDA 與標籤比較

con=matrix(0,6,8)
colnames(con)=tage_name      
rownames(con)=c("1.船隻原料","2.碼頭1","3.客戶關係","4.碼頭2","5.天氣","6.颱風")
for(k in 1:nrow(document_topic2)){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage1[k])
con[i,j]=con[i,j]+1
}
k=1
for(k in 1:nrow(document_topic2)){
  if(is.na(document_topic2$tage2[k])==F){
    i<-document_topic2$topic[k]
    j=which(tage_name==document_topic2$tage2[k])
    con[i,j]=con[i,j]+1
  }
}

for(k in 1:nrow(document_topic2)){
  if(is.na(document_topic2$tage3[k])==F){
    i<-document_topic2$topic[k]
    j=which(tage_name==document_topic2$tage3[k])
    con[i,j]=con[i,j]+1
  }
}
colnames(con)=c("COVID-19","海事檢驗","市場情報","時事通訊","港口諮詢",
                "綜合報導","產品服務新聞","天氣")
con
##            COVID-19 海事檢驗 市場情報 時事通訊 港口諮詢 綜合報導
## 1.船隻原料        0        0        0       24        0        0
## 2.碼頭1           0        2        1       26        4        1
## 3.客戶關係        2       13       10        2       66       13
## 4.碼頭2           0        0        4       12       11        4
## 5.天氣            0        0        0        0        3        0
## 6.颱風            1        4        5        2       53        3
##            產品服務新聞 天氣
## 1.船隻原料            0    0
## 2.碼頭1               1    1
## 3.客戶關係            6   16
## 4.碼頭2               5    8
## 5.天氣                1    5
## 6.颱風                0   61