Data source

Inchcape Shipping Services:
https://www.iss-shipping.com/news-and-media
Period: 2018-05-30 to 2020-03-19 (323 articles in total)

Import packages

require("tm")
library(qdap)
library(tidyr)
library(dplyr)
require(readr)
require(lubridate)
require(stringr)
library(textstem)
library(tidytext)
library(topicmodels)
library(tidyverse)
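
If any of these packages are missing, a one-time install from CRAN comes first; note that UserNetR (used in the network section below) is not on CRAN and is usually installed from GitHub. A setup sketch, assuming default repositories:

# One-time setup: install the CRAN packages used below (skip any already installed)
install.packages(c("tm", "qdap", "tidyr", "dplyr", "readr", "lubridate",
                   "stringr", "textstem", "tidytext", "topicmodels",
                   "tidyverse", "beepr", "statnet"))
# UserNetR ships with Luke's network analysis book, not CRAN:
# devtools::install_github("DougLuke/UserNetR")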

Import data

original_data <- read_csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/allnewdata.csv",
                          col_names = TRUE,
                          col_types = cols(date = col_character(),
                                           title = col_character(),
                                           body = col_character()),
                          locale = locale(encoding = "Big-5"))

Preprocessing

original_data$news_id <- 1:nrow(original_data)   # assign a news ID
original_data$date <- ymd(original_data$date)    # parse dates
original_data$date <- original_data$date + 1     # shift every date forward one day
news_range <- range(original_data$date)          # "2018-05-31" "2020-03-20"
original_data$body <- str_replace_all(original_data$body, "[[:punct:]]", "") # strip punctuation

Tokenization

var_stopwords <- stopwords()
tmp <- original_data %>%
  unnest_tokens(word, body) %>%              # tokenize the body text
  filter(!grepl("[^a-zA-Z]", word)) %>%      # keep purely alphabetic tokens
  mutate(word = lemmatize_strings(word)) %>% # lemmatize each word
  group_by(word, news_id) %>%
  summarise(count = n())
id <- which(tmp$word %in% var_stopwords)  # remove stopwords
tfm <- tmp[-id, ]
tfm[100:105, ]
## # A tibble: 6 x 3
## # Groups:   word [4]
##   word          news_id count
##   <chr>           <int> <int>
## 1 aaprnwoodside      98     1
## 2 aaron              66     1
## 3 aaron              76     1
## 4 abaco              79     2
## 5 abaco              82     2
## 6 abadi              95     4
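
One caveat with `tfm <- tmp[-id, ]`: if `id` were ever empty, `tmp[-integer(0), ]` returns zero rows rather than all rows. An anti_join sidesteps that edge case; a minimal equivalent sketch (tfm2 is a hypothetical name):

# Equivalent stopword removal, safe even when no words match
tfm2 <- tmp %>% anti_join(tibble(word = var_stopwords), by = "word")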

Convert to a document-term matrix (DTM)

dtm <- cast_dtm(tfm, news_id, word, count)
dtm
## <<DocumentTermMatrix (documents: 323, terms: 9903)>>
## Non-/sparse entries: 66833/3131836
## Sparsity           : 98%
## Maximal term length: NA
## Weighting          : term frequency (tf)
news_time=original_data$date
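
tm's inspect() is a quick way to eyeball a corner of the matrix (the row names are the news_id values); for example:

inspect(dtm[1:5, 1:5])  # first five documents x first five terms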

Clustering with LDA

Choosing the number of topics with AIC

best.model <- lapply(seq(2, 22, by = 4), function(k) LDA(dtm, k, control = list(seed = 1234)))
beepr::beep(8)  # audible alert when the fits finish
best.model.aic.df <- data.frame(topics = seq(2, 22, by = 4),
                                AIC = sapply(best.model, AIC))
ggplot(best.model.aic.df, aes(x = topics, y = AIC)) +
  xlab("Number of topics") + ylab("AIC of the model") +
  geom_line() +
  theme_bw()
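
AIC on LDA_VEM fits is only a rough guide; topicmodels' perplexity() offers a second opinion over the same candidate models. A sketch reusing best.model from above (perp is a hypothetical name):

# Lower perplexity suggests a better fit; compare across candidate k values
perp <- sapply(best.model, function(m) perplexity(m, dtm))
data.frame(topics = seq(2, 22, by = 4), perplexity = perp)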

We settle on 6 topics.

Fitting the model

m_lda <- LDA(dtm, k = 6, control = list(seed = 1234))  # k = number of topics

Interpreting topics via beta

m_lda
## A LDA_VEM topic model with 6 topics.
ap_topics <- tidy(m_lda, matrix = "beta")  # per-topic word probabilities
options(repr.plot.width = 14, repr.plot.height = 10)  # plot output size
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%   # ten most probable words per topic
  ungroup() %>%
  arrange(topic, -beta)


ap_top_terms$topic <- factor(ap_top_terms$topic, levels = 1:6,
                             labels = c("1. Vessels & raw materials", "2. Wharf 1",
                                        "3. Customer relations", "4. Wharf 2",
                                        "5. Weather", "6. Typhoons"))
ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +  # one panel per topic
  coord_flip() +                          # horizontal bars
  scale_x_reordered()                     # strip the reorder_within suffix
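
As a sanity check on the tidy()-based plot, topicmodels can list the same top words directly:

terms(m_lda, 10)  # ten most probable terms per topic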

Computing gamma

Assign each article to its most probable topic

ap_documents <- tidy(m_lda, matrix = "gamma")
m_gamma <- pivot_wider(ap_documents, names_from = topic, values_from = gamma)
m_gamma$document <- as.numeric(m_gamma$document)
document_topic <- data.frame(document = m_gamma$document)
for (i in 1:nrow(document_topic)) {  # per document, keep the topic with max gamma
  document_topic$topic[i] <- which.max(unlist(m_gamma[i, 2:ncol(m_gamma)]))
}
news_time <- data.frame(date = original_data$date,
                        document = original_data$news_id)
document_topic <- inner_join(document_topic, news_time, by = "document")
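
The per-row which.max loop can also be written against the long-format gamma table with dplyr's slice_max (dplyr >= 1.0); a sketch, with doc_topic_tidy a hypothetical name:

# For each document, keep only the topic with the largest gamma
doc_topic_tidy <- ap_documents %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(document = as.numeric(document))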

Comparison with the website tags

tag <- read.csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/inchcape_0428news.csv")
document_topic$tage <- NA_character_
for (i in 1:nrow(document_topic)) {
  id <- document_topic$document[i]
  title <- original_data$title[which(original_data$news_id == id)]
  hit <- which(tag$title == title)
  if (length(hit) > 0) {  # the one untagged article (index 181) is left as NA
    document_topic$tage[i] <- as.character(tag$tag_name[hit[1]])
  }
}
document_topic[46:50, ]
##    document topic       date                                      tage
## 46       74     4 2019-09-21                             Newsletters  
## 47       84     2 2019-08-31     Port Advisories, Weather Advisories  
## 48       91     2 2019-08-03                             Newsletters  
## 49      113     2 2019-06-22                             Newsletters  
## 50      132     6 2019-05-24                         Port Advisories
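
The title-matching loop is essentially a join on title, so a left_join gives the same result and leaves NA for the untagged article automatically. A sketch assuming tag titles match original_data$title exactly (tag_by_id is a hypothetical name):

# Attach tags by title; untagged articles end up with NA
tag_by_id <- original_data %>%
  select(news_id, title) %>%
  left_join(tag, by = "title")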
document_topic2 <- document_topic

Splitting the website tags

document_topic2$tage <- as.character(document_topic2$tage)
document_topic2$tage <- str_replace_all(document_topic2$tage, " ", "")  # drop spaces
document_topic2$tage1 <- NA
document_topic2$tage2 <- NA
document_topic2$tage3 <- NA
for (i in 1:nrow(document_topic2)) {
  tmp <- strsplit(document_topic2$tage[i], split = ",")
  L <- length(tmp[[1]])
  if (L > 0) {
    for (j in 1:L) {
      document_topic2[i, j + 4] <- tmp[[1]][j]  # columns 5-7 are tage1-tage3
    }
  }
}
tage_name <- sort(unique(na.omit(c(document_topic2$tage1,
                                   document_topic2$tage2,
                                   document_topic2$tage3))))
document_topic2[46:50, ]
##    document topic       date                             tage
## 46       74     4 2019-09-21                      Newsletters
## 47       84     2 2019-08-31 PortAdvisories,WeatherAdvisories
## 48       91     2 2019-08-03                      Newsletters
## 49      113     2 2019-06-22                      Newsletters
## 50      132     6 2019-05-24                   PortAdvisories
##             tage1             tage2 tage3
## 46    Newsletters              <NA>  <NA>
## 47 PortAdvisories WeatherAdvisories  <NA>
## 48    Newsletters              <NA>  <NA>
## 49    Newsletters              <NA>  <NA>
## 50 PortAdvisories              <NA>  <NA>
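
tidyr's separate() expresses the same split declaratively, with fill = "right" padding missing pieces with NA. A sketch starting from the unsplit table (document_topic_split is a hypothetical name):

# Split the comma-separated tag string into three columns in one call
document_topic_split <- document_topic %>%
  mutate(tage = str_replace_all(as.character(tage), " ", "")) %>%
  separate(tage, into = c("tage1", "tage2", "tage3"),
           sep = ",", fill = "right", remove = FALSE)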

Comparing LDA topics with the tags

con <- matrix(0, 6, 8)
colnames(con) <- tage_name
rownames(con) <- c("1. Vessels & raw materials", "2. Wharf 1", "3. Customer relations",
                   "4. Wharf 2", "5. Weather", "6. Typhoons")
for (k in 1:nrow(document_topic2)) {  # count each article's first tag
  if (!is.na(document_topic2$tage1[k])) {
    i <- document_topic2$topic[k]
    j <- which(tage_name == document_topic2$tage1[k])
    con[i, j] <- con[i, j] + 1
  }
}
for (k in 1:nrow(document_topic2)) {  # count the second tag, if any
  if (!is.na(document_topic2$tage2[k])) {
    i <- document_topic2$topic[k]
    j <- which(tage_name == document_topic2$tage2[k])
    con[i, j] <- con[i, j] + 1
  }
}
for (k in 1:nrow(document_topic2)) {  # count the third tag, if any
  if (!is.na(document_topic2$tage3[k])) {
    i <- document_topic2$topic[k]
    j <- which(tage_name == document_topic2$tage3[k])
    con[i, j] <- con[i, j] + 1
  }
}
colnames(con) <- c("COVID-19", "Marine surveys", "Market intelligence", "Newsletters",
                   "Port advisories", "General news", "Product & service news",
                   "Weather advisories")
con
##                            COVID-19 Marine surveys Market intelligence Newsletters
## 1. Vessels & raw materials        0              0                   0          24
## 2. Wharf 1                        0              2                   1          26
## 3. Customer relations             2             13                  10           2
## 4. Wharf 2                        0              0                   4          12
## 5. Weather                        0              0                   0           0
## 6. Typhoons                       1              4                   5           2
##                            Port advisories General news Product & service news Weather advisories
## 1. Vessels & raw materials               0            0                      0                  0
## 2. Wharf 1                               4            1                      1                  1
## 3. Customer relations                   66           13                      6                 16
## 4. Wharf 2                              11            4                      5                  8
## 5. Weather                               3            0                      1                  5
## 6. Typhoons                             53            3                      0                 61
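
The three accumulation loops can also be collapsed by reshaping the tag columns to long format and tabulating; a sketch (long_tags is a hypothetical name):

# One row per (article, tag) pair, then a plain contingency table
long_tags <- document_topic2 %>%
  pivot_longer(c(tage1, tage2, tage3), values_to = "tag_split") %>%
  filter(!is.na(tag_split))
with(long_tags, table(topic, tag_split))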

Viewing the relationships between topics as a social network

Compute the word-usage correlations between topics

data <- as.matrix(dtm)
all_topic <- list()
for (i in 1:6) {   # documents assigned to each LDA topic
  all_topic[[i]] <- document_topic2$document[which(document_topic2$topic == i)]
}
for (i in 7:14) {  # documents carrying each website tag
  tmp_name <- tage_name[i - 6]
  all_topic[[i]] <- document_topic2$document[which(tmp_name == document_topic2$tage1 |
                                                   tmp_name == document_topic2$tage2 |
                                                   tmp_name == document_topic2$tage3)]
}
topic_tf <- as.data.frame(matrix(NA, 14, ncol(data)))
for (i in 1:14) {  # sum term frequencies over the documents in each group
  tmp <- data[rownames(data) %in% all_topic[[i]], , drop = FALSE]
  topic_tf[i, ] <- colSums(tmp)
}
topic_tf <- t(topic_tf)
cor_topic <- cor(topic_tf)
cor_topic
##             [,1]      [,2]      [,3]      [,4]       [,5]      [,6]
##  [1,] 1.00000000 0.3636265 0.2554635 0.4155203 0.07097942 0.1307479
##  [2,] 0.36362648 1.0000000 0.6194037 0.9558538 0.21957671 0.5223349
##  [3,] 0.25546353 0.6194037 1.0000000 0.6656802 0.30348673 0.8224013
##  [4,] 0.41552025 0.9558538 0.6656802 1.0000000 0.25512822 0.5476796
##  [5,] 0.07097942 0.2195767 0.3034867 0.2551282 1.00000000 0.3733969
##  [6,] 0.13074788 0.5223349 0.8224013 0.5476796 0.37339694 1.0000000
##  [7,] 0.16074140 0.3644142 0.6410622 0.3898636 0.16993453 0.5483219
##  [8,] 0.10624386 0.4691816 0.8283379 0.4860772 0.26545647 0.9369213
##  [9,] 0.15198843 0.5668416 0.8443384 0.5881096 0.28057730 0.9561946
## [10,] 0.86680910 0.7775343 0.5094688 0.7992223 0.17024351 0.3769534
## [11,] 0.22627000 0.6464669 0.9058687 0.6885227 0.41548872 0.9469004
## [12,] 0.12038044 0.4874689 0.8269835 0.5079981 0.24876275 0.9373855
## [13,] 0.21843257 0.4960621 0.6245072 0.5367192 0.14433248 0.4237950
## [14,] 0.16416996 0.5806596 0.8315330 0.6118637 0.46671917 0.9826081
##            [,7]      [,8]      [,9]     [,10]     [,11]     [,12]
##  [1,] 0.1607414 0.1062439 0.1519884 0.8668091 0.2262700 0.1203804
##  [2,] 0.3644142 0.4691816 0.5668416 0.7775343 0.6464669 0.4874689
##  [3,] 0.6410622 0.8283379 0.8443384 0.5094688 0.9058687 0.8269835
##  [4,] 0.3898636 0.4860772 0.5881096 0.7992223 0.6885227 0.5079981
##  [5,] 0.1699345 0.2654565 0.2805773 0.1702435 0.4154887 0.2487627
##  [6,] 0.5483219 0.9369213 0.9561946 0.3769534 0.9469004 0.9373855
##  [7,] 1.0000000 0.5864853 0.5630599 0.3069956 0.5692915 0.6270960
##  [8,] 0.5864853 1.0000000 0.9470196 0.3301444 0.8618255 0.9684688
##  [9,] 0.5630599 0.9470196 1.0000000 0.4124739 0.9104776 0.9528173
## [10,] 0.3069956 0.3301444 0.4124739 1.0000000 0.5060638 0.3497688
## [11,] 0.5692915 0.8618255 0.9104776 0.5060638 1.0000000 0.8620074
## [12,] 0.6270960 0.9684688 0.9528173 0.3497688 0.8620074 1.0000000
## [13,] 0.4131537 0.5016472 0.4436227 0.4139623 0.5018707 0.5136955
## [14,] 0.5366393 0.8878157 0.9182934 0.4301288 0.9646067 0.8877398
##           [,13]     [,14]
##  [1,] 0.2184326 0.1641700
##  [2,] 0.4960621 0.5806596
##  [3,] 0.6245072 0.8315330
##  [4,] 0.5367192 0.6118637
##  [5,] 0.1443325 0.4667192
##  [6,] 0.4237950 0.9826081
##  [7,] 0.4131537 0.5366393
##  [8,] 0.5016472 0.8878157
##  [9,] 0.4436227 0.9182934
## [10,] 0.4139623 0.4301288
## [11,] 0.5018707 0.9646067
## [12,] 0.5136955 0.8877398
## [13,] 1.0000000 0.4439953
## [14,] 0.4439953 1.0000000

Import social network packages

library(devtools)  # UserNetR is typically installed from GitHub via devtools
library(statnet)
library(UserNetR)

Define the network edges

(standard=summary(as.vector(cor_topic)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07098 0.37340 0.53668 0.57579 0.86183 1.00000
netmat1 <- matrix(0, 14, 14)
netmat1[cor_topic > standard[5]] <- 1  # connect pairs above the 3rd quartile
node_names <- c("Vessels & raw materials", "Wharf 1", "Customer relations", "Wharf 2",
                "Weather", "Typhoons",
                "COVID-19", "Marine surveys", "Market intelligence", "Newsletters",
                "Port advisories", "General news", "Product & service news",
                "Weather advisories")
rownames(netmat1) <- node_names
colnames(netmat1) <- node_names
net1 <- network(netmat1, matrix.type = "adjacency")
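
With the network built, a quick centrality check shows which topics sit at the core; sna's degree() (loaded as part of statnet) suffices for this undirected graph:

# Degree centrality per node (undirected graph)
data.frame(node = rownames(netmat1),
           degree = sna::degree(net1, gmode = "graph"))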

Red nodes are LDA topics,
blue nodes are website tags,
and node size reflects the number of articles.

colo <- c(rep("red", 6), rep("blue", 8))  # red = LDA topics, blue = tags
vector_size <- sapply(all_topic, length)  # number of articles per group

plot(net1, displaylabels = TRUE,
     vertex.cex = log10(vector_size),
     vertex.col = colo)
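
A base-graphics legend can be layered on after plot() to make the color coding explicit on the figure itself:

# Annotate the node color coding on the finished plot
legend("topleft", legend = c("LDA topic", "Website tag"),
       pt.bg = c("red", "blue"), pch = 21, pt.cex = 1.5, bty = "n")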