Inchcape Shipping Services:
https://www.iss-shipping.com/news-and-media
Time period: 2018-05-30 to 2020-03-19 (323 articles in total)
require("tm")
library(qdap)
library(tidyr)
library(dplyr)
require(readr)
require(lubridate)
require(stringr)
library(textstem)
library(tidytext)
library(topicmodels)
library(tidyverse)
original_data<-read_csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/allnewdata.csv", col_names = TRUE,
col_types = cols(date = col_character(), title = col_character(), body = col_character()),
locale = locale(encoding = "Big-5"))
original_data$news_id=c(1:nrow(original_data)) # assign a news ID to each article
original_data$date<-ymd(original_data$date) # parse the date strings into Date objects
original_data$date=original_data$date+1 # shift every date forward by one day
news_range<-range(original_data$date) #"2018-05-31" "2020-03-20"#
original_data$body<-str_replace_all(original_data$body,"[[:punct:]]","") # remove punctuation
var_stopwords<-stopwords() # English stop-word list from tm
tmp=original_data %>%unnest_tokens(word, body) %>% # tokenize the article body
filter(!grepl("[^a-zA-Z]",word))%>% # keep purely alphabetic tokens
mutate(word=lemmatize_strings(word))%>% # lemmatize each token
group_by(word,news_id)%>%
summarise(count=n())
id=which(tmp$word%in%var_stopwords) # rows whose word is a stop word
tfm=tmp[-id,] # drop the stop words
tfm[100:105,]
## # A tibble: 6 x 3
## # Groups: word [4]
## word news_id count
## <chr> <int> <int>
## 1 aaprnwoodside 98 1
## 2 aaron 66 1
## 3 aaron 76 1
## 4 abaco 79 2
## 5 abaco 82 2
## 6 abadi 95 4
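An equivalent, more idiomatic way to drop stop words is an anti_join against tidytext's built-in stop_words table. A minimal sketch (not used above; its lexicons are broader than tm::stopwords(), so the resulting vocabulary may differ slightly):
# Sketch: stop-word removal via anti_join; tfm_alt is a hypothetical name
tfm_alt <- tmp %>%
  ungroup() %>%
  anti_join(stop_words, by = "word")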
dtm<-cast_dtm(tfm,news_id,word,count) # cast the counts into a document-term matrix
dtm
## <<DocumentTermMatrix (documents: 323, terms: 9903)>>
## Non-/sparse entries: 66833/3131836
## Sparsity : 98%
## Maximal term length: NA
## Weighting : term frequency (tf)
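The DTM is 98% sparse. One optional preprocessing step, not applied in this analysis, is to drop very rare terms before fitting LDA. A minimal sketch with tm's removeSparseTerms(), where 0.99 is an illustrative threshold (terms must appear in at least roughly 1% of documents to survive):
# Sketch only: trim very rare terms; dtm_trimmed is a hypothetical name
dtm_trimmed <- tm::removeSparseTerms(dtm, sparse = 0.99)
dtm_trimmed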
news_time=original_data$date
best.model <- lapply(seq(2,22, by=4), function(k){LDA(dtm, k,control = list(seed = 1234))}) # fit LDA for k = 2, 6, 10, 14, 18, 22
beepr::beep(8) # play a sound when the fits finish
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, AIC))) # AIC of each fitted model (despite the "logLik" name)
best.model.logLik.df <- data.frame(topics=seq(2,22, by=4), LL=as.numeric(as.matrix(best.model.logLik)))
best.model.logLik.df=best.model.logLik.df[1:6,]
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) +
xlab("Number of topics") + ylab("AIC of the model") +
geom_line() +
theme_bw()
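As a cross-check on the AIC curve, topicmodels also provides perplexity() for fitted models. A minimal sketch re-using the candidates in best.model (lower perplexity is better; not part of the original selection):
# Sketch: training-set perplexity of each candidate LDA model
perp <- sapply(best.model, function(m) topicmodels::perplexity(m))
data.frame(topics = seq(2, 22, by = 4), perplexity = perp)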
Based on the AIC curve, we decide to use 6 topics.
m_lda <- LDA(dtm, k = 6, control = list(seed = 1234)) # fit the final LDA model with k = 6 topics
m_lda
## A LDA_VEM topic model with 6 topics.
ap_topics <- tidy(m_lda, matrix = "beta") # per-topic word probabilities
options(repr.plot.width=14, repr.plot.height=10) # plot output size
ap_top_terms <- ap_topics %>%
group_by(topic) %>% # group by topic
top_n(10, beta) %>% # keep the 10 most probable terms in each topic
ungroup() %>%
arrange(topic, -beta)
ap_top_terms$topic=ifelse(ap_top_terms$topic==1,"1.船隻原料",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==2,"2.碼頭1",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==3,"3.客戶關係",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==4,"4.碼頭2",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==5,"5.天氣",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==6,"6.颱風",ap_top_terms$topic)
ap_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + # one panel per topic
coord_flip() + # flip coordinates so terms are on the y-axis
scale_x_reordered() # strip the within-facet ordering suffix
Assign each article to the topic with the highest posterior probability (gamma).
ap_documents <- tidy(m_lda, matrix = "gamma") # per-document topic probabilities
m_gamma<-pivot_wider(ap_documents,names_from = topic, values_from = gamma)
m_gamma$document=as.numeric(m_gamma$document)
document_topic=data.frame(
document=m_gamma$document
)
for(i in 1:nrow(document_topic)){
document_topic$topic[i]=which.max(m_gamma[i,2:ncol(m_gamma)]) # index of the most probable topic for document i
}
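The same per-document dominant topic can be obtained without the explicit loop. A minimal sketch using dplyr's slice_max (dplyr >= 1.0.0) on the long-format gamma table; document_topic_alt is a hypothetical name:
# Sketch: for each document, keep the row with the largest gamma
document_topic_alt <- ap_documents %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(document = as.numeric(document)) %>%
  arrange(document)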
news_time=data.frame(date=original_data$date,
document=original_data$news_id)
document_topic<-inner_join(document_topic,news_time,by=c("document"="document"))
tag <- read.csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/inchcape_0428news.csv")
document_topic$tage=c()
id=document_topic$document[1]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage=tag$tag_name[which(tag$title==title)] # initialize the tag column with the first article's tag
for(i in c(c(1:180),c(182:nrow(document_topic)))){ # article 181 has no tag and is skipped
id=document_topic$document[i]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage[i]=tag$tag_name[which(tag$title==title)]
}
(document_topic)[46:50,]
## document topic date tage
## 46 74 4 2019-09-21 Newsletters
## 47 84 2 2019-08-31 Port Advisories, Weather Advisories
## 48 91 2 2019-08-03 Newsletters
## 49 113 2 2019-06-22 Newsletters
## 50 132 6 2019-05-24 Port Advisories
document_topic2=(document_topic)
document_topic2$tage=as.character(document_topic2$tage)
document_topic2$tage<-str_replace_all(document_topic2$tage," ","") # strip spaces so multi-word tags become single tokens
document_topic2$tage1=NA
document_topic2$tage2=NA
document_topic2$tage3=NA
for(i in 1:nrow(document_topic)){
tmp=strsplit(document_topic2$tage[i],split=",") # split the comma-separated tag string
L=length(tmp[[1]])
if(L>0){
for(j in 1:L){
document_topic2[i,j+4]=tmp[[1]][j] # fill tage1/tage2/tage3 from left to right
}
}
}
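The comma-separated tag string can also be split in a single call with tidyr::separate(). A minimal sketch assuming, as above, at most three tags per article; document_topic2_alt is a hypothetical name:
# Sketch: split "tage" into tage1/tage2/tage3 in one step
document_topic2_alt <- document_topic %>%
  mutate(tage = str_replace_all(as.character(tage), " ", "")) %>%
  separate(tage, into = c("tage1", "tage2", "tage3"),
           sep = ",", fill = "right", remove = FALSE)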
tage_name=sort(unique(na.omit(c(document_topic2$tage1,document_topic2$tage2,document_topic2$tage3)))) # all distinct tag names
(document_topic2)[46:50,]
## document topic date tage
## 46 74 4 2019-09-21 Newsletters
## 47 84 2 2019-08-31 PortAdvisories,WeatherAdvisories
## 48 91 2 2019-08-03 Newsletters
## 49 113 2 2019-06-22 Newsletters
## 50 132 6 2019-05-24 PortAdvisories
## tage1 tage2 tage3
## 46 Newsletters <NA> <NA>
## 47 PortAdvisories WeatherAdvisories <NA>
## 48 Newsletters <NA> <NA>
## 49 Newsletters <NA> <NA>
## 50 PortAdvisories <NA> <NA>
con=matrix(0,6,8) # contingency matrix: 6 LDA topics x 8 website tags
colnames(con)=tage_name
rownames(con)=c("1.船隻原料","2.碼頭1","3.客戶關係","4.碼頭2","5.天氣","6.颱風")
for(k in 1:nrow(document_topic2)){ # count each article's first tag
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage1[k])
con[i,j]=con[i,j]+1
}
for(k in 1:nrow(document_topic2)){ # count the second tag, if present
if(!is.na(document_topic2$tage2[k])){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage2[k])
con[i,j]=con[i,j]+1
}
}
for(k in 1:nrow(document_topic2)){ # count the third tag, if present
if(!is.na(document_topic2$tage3[k])){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage3[k])
con[i,j]=con[i,j]+1
}
}
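The three filling loops amount to a topic-by-tag contingency count. A minimal sketch reproducing the same counts by pivoting the tag columns to long format and tabulating; long_tags and con_alt are hypothetical names:
# Sketch: same topic-by-tag counts via pivot_longer + table()
long_tags <- document_topic2 %>%
  pivot_longer(cols = c(tage1, tage2, tage3),
               names_to = "slot", values_to = "tag",
               values_drop_na = TRUE)
con_alt <- table(long_tags$topic, long_tags$tag)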
colnames(con)=c("COVID-19","海事檢驗","市場情報","時事通訊","港口諮詢",
"綜合報導","產品服務新聞","天氣")
con
## COVID-19 海事檢驗 市場情報 時事通訊 港口諮詢 綜合報導
## 1.船隻原料 0 0 0 24 0 0
## 2.碼頭1 0 2 1 26 4 1
## 3.客戶關係 2 13 10 2 66 13
## 4.碼頭2 0 0 4 12 11 4
## 5.天氣 0 0 0 0 3 0
## 6.颱風 1 4 5 2 53 3
## 產品服務新聞 天氣
## 1.船隻原料 0 0
## 2.碼頭1 1 1
## 3.客戶關係 6 16
## 4.碼頭2 5 8
## 5.天氣 1 5
## 6.颱風 0 61
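Because the topics differ in size, the raw counts in con can be easier to compare as row proportions. A minimal sketch with base prop.table() (not part of the original output):
# Sketch: share of each tag within every LDA topic (rows sum to 1)
round(prop.table(con, margin = 1), 2)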
data=(as.matrix(dtm)) # dense copy of the document-term matrix
all_topic=list()
for(i in 1:6){
all_topic[[i]]=document_topic2$document[which(document_topic2$topic==i)] # documents assigned to LDA topic i
}
for(i in 7:14){
tmp_name=tage_name[i-6]
all_topic[[i]]=document_topic2$document[which(tmp_name==document_topic2$tage1|tmp_name==document_topic2$tage2|tmp_name==document_topic2$tage3)] # documents carrying tag i-6
}
topic_tf=as.data.frame(matrix(NA,14,ncol(data))) # term frequencies for the 6 topics and 8 tags
for(i in 1:14){
tmp=data[which(rownames(data)%in%all_topic[[i]]),]
topic_tf[i,]=apply(tmp, 2, sum) # total count of each term within group i
}
topic_tf=(t(topic_tf)) # terms in rows, the 14 groups in columns
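Each row of topic_tf is just the column sums of the DTM restricted to one group's documents. A minimal sketch without the explicit loop; topic_tf_alt is a hypothetical name, and cor(t(topic_tf_alt)) reproduces cor_topic below:
# Sketch: per-group term totals; drop = FALSE keeps single-document groups as matrices
topic_tf_alt <- t(sapply(all_topic, function(docs) {
  colSums(data[rownames(data) %in% docs, , drop = FALSE])
}))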
cor_topic=cor(topic_tf)
cor_topic
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00000000 0.3636265 0.2554635 0.4155203 0.07097942 0.1307479
## [2,] 0.36362648 1.0000000 0.6194037 0.9558538 0.21957671 0.5223349
## [3,] 0.25546353 0.6194037 1.0000000 0.6656802 0.30348673 0.8224013
## [4,] 0.41552025 0.9558538 0.6656802 1.0000000 0.25512822 0.5476796
## [5,] 0.07097942 0.2195767 0.3034867 0.2551282 1.00000000 0.3733969
## [6,] 0.13074788 0.5223349 0.8224013 0.5476796 0.37339694 1.0000000
## [7,] 0.16074140 0.3644142 0.6410622 0.3898636 0.16993453 0.5483219
## [8,] 0.10624386 0.4691816 0.8283379 0.4860772 0.26545647 0.9369213
## [9,] 0.15198843 0.5668416 0.8443384 0.5881096 0.28057730 0.9561946
## [10,] 0.86680910 0.7775343 0.5094688 0.7992223 0.17024351 0.3769534
## [11,] 0.22627000 0.6464669 0.9058687 0.6885227 0.41548872 0.9469004
## [12,] 0.12038044 0.4874689 0.8269835 0.5079981 0.24876275 0.9373855
## [13,] 0.21843257 0.4960621 0.6245072 0.5367192 0.14433248 0.4237950
## [14,] 0.16416996 0.5806596 0.8315330 0.6118637 0.46671917 0.9826081
## [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 0.1607414 0.1062439 0.1519884 0.8668091 0.2262700 0.1203804
## [2,] 0.3644142 0.4691816 0.5668416 0.7775343 0.6464669 0.4874689
## [3,] 0.6410622 0.8283379 0.8443384 0.5094688 0.9058687 0.8269835
## [4,] 0.3898636 0.4860772 0.5881096 0.7992223 0.6885227 0.5079981
## [5,] 0.1699345 0.2654565 0.2805773 0.1702435 0.4154887 0.2487627
## [6,] 0.5483219 0.9369213 0.9561946 0.3769534 0.9469004 0.9373855
## [7,] 1.0000000 0.5864853 0.5630599 0.3069956 0.5692915 0.6270960
## [8,] 0.5864853 1.0000000 0.9470196 0.3301444 0.8618255 0.9684688
## [9,] 0.5630599 0.9470196 1.0000000 0.4124739 0.9104776 0.9528173
## [10,] 0.3069956 0.3301444 0.4124739 1.0000000 0.5060638 0.3497688
## [11,] 0.5692915 0.8618255 0.9104776 0.5060638 1.0000000 0.8620074
## [12,] 0.6270960 0.9684688 0.9528173 0.3497688 0.8620074 1.0000000
## [13,] 0.4131537 0.5016472 0.4436227 0.4139623 0.5018707 0.5136955
## [14,] 0.5366393 0.8878157 0.9182934 0.4301288 0.9646067 0.8877398
## [,13] [,14]
## [1,] 0.2184326 0.1641700
## [2,] 0.4960621 0.5806596
## [3,] 0.6245072 0.8315330
## [4,] 0.5367192 0.6118637
## [5,] 0.1443325 0.4667192
## [6,] 0.4237950 0.9826081
## [7,] 0.4131537 0.5366393
## [8,] 0.5016472 0.8878157
## [9,] 0.4436227 0.9182934
## [10,] 0.4139623 0.4301288
## [11,] 0.5018707 0.9646067
## [12,] 0.5136955 0.8877398
## [13,] 1.0000000 0.4439953
## [14,] 0.4439953 1.0000000
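A quick way to eyeball the 14 x 14 correlation structure before thresholding is a heatmap. A minimal sketch with base heatmap() (not part of the original output):
# Sketch: visual overview of the group-by-group correlations
heatmap(cor_topic, symm = TRUE)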
### Load social network analysis packages
library(devtools)
library("UserNetR", lib.loc="~/R/win-library/3.6")
library(statnet)
(standard=summary(as.vector(cor_topic))) # distribution of all pairwise correlations
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07098 0.37340 0.53668 0.57579 0.86183 1.00000
netmat1=matrix(0,14,14)
netmat1[cor_topic>standard[5]]=1 # connect two groups when their correlation exceeds the 3rd quartile
rownames(netmat1) <- c("船隻原料","碼頭1","客戶關係","碼頭2","天氣","颱風",
"COVID-19","海事檢驗","市場情報","時事通訊","港口諮詢",
"綜合報導","產品服務新聞","天氣")
colnames(netmat1) <- c("船隻原料","碼頭1","客戶關係","碼頭2","天氣","颱風",
"COVID-19","海事檢驗","市場情報","時事通訊","港口諮詢",
"綜合報導","產品服務新聞","天氣")
net1 <- network(netmat1,matrix.type="adjacency")
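Before plotting, a couple of summary statistics help sanity-check the thresholded network. A minimal sketch using functions from the statnet suite loaded above (network and sna); not part of the original output:
# Sketch: basic descriptives of the co-occurrence network
network.density(net1)              # proportion of possible ties that are present
sna::degree(net1, gmode = "graph") # degree of each of the 14 nodes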
Red nodes are the LDA topics.
Blue nodes are the website tag categories.
Node size reflects the number of articles in each group.
colo=c(rep("red",6),rep("blue",8)) # red = LDA topics, blue = website tags
vector_size=c()
for(i in 1:14){
vector_size[i]=length(all_topic[[i]]) # number of articles in each group
}
plot(net1, displaylabels = TRUE,
vertex.cex=log10(vector_size),
vertex.col=colo)