Inchape shipping service :
https://www.iss-shipping.com/news-and-media
時間:
2018-05-30~2020-03-19(共323篇)
require("tm")
library(qdap)
library(tidyr)
library(dplyr)
require(readr)
require(lubridate)
require(stringr)
library(textstem)
library(tidytext)
library(topicmodels)
library(tidyverse)
original_data<-read_csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/allnewdata.csv" ,col_names = T,
,col_types = cols( date = col_character(), title =col_character(),body =col_character())
,locale = locale(encoding = "Big-5"))
original_data$news_id=c(1:nrow(original_data)) #標註新聞ID
original_data$date<-ymd(original_data$date) #時間轉格式
original_data$date=original_data$date+1 #時間+1天
news_range<-range(original_data$date) #"2018-05-31" "2020-03-20"#
original_data$body<-str_replace_all(original_data$body,"[[:punct:]]","") #刪除標點
var_stopwords<-stopwords()
tmp=original_data %>%unnest_tokens(word, body) %>% #斷詞
filter(!grepl("[^a-zA-Z]",word))%>%
mutate(word=lemmatize_strings(word))%>% #還原詞幹
group_by(word,news_id)%>%
summarise(count=n())
## Registered S3 methods overwritten by 'textclean':
## method from
## print.check_text qdap
## print.sub_holder qdap
id=which(tmp$word%in%var_stopwords) #去除停用字
tfm=tmp[-id,]
tfm[100:105,]
## # A tibble: 6 x 3
## # Groups: word [4]
## word news_id count
## <chr> <int> <int>
## 1 aaprnwoodside 98 1
## 2 aaron 66 1
## 3 aaron 76 1
## 4 abaco 79 2
## 5 abaco 82 2
## 6 abadi 95 4
dtm<-cast_dtm(tfm,news_id,word,count)
dtm
## <<DocumentTermMatrix (documents: 323, terms: 9903)>>
## Non-/sparse entries: 66833/3131836
## Sparsity : 98%
## Maximal term length: NA
## Weighting : term frequency (tf)
news_time=original_data$date
best.model <- lapply(seq(2,22, by=4), function(k){LDA(dtm, k,control = list(seed = 1234))})
beepr::beep(8)
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, AIC)))
best.model.logLik.df <- data.frame(topics=seq(2,22, by=4), LL=as.numeric(as.matrix(best.model.logLik)))
best.model.logLik.df=best.model.logLik.df[1:6,]
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) +
xlab("Number of topics") + ylab("AIC of the model") +
geom_line() +
theme_bw()
決定分成6群
m_lda <- LDA(dtm, k = 6, control = list(seed = 1234))# k means class
m_lda
## A LDA_VEM topic model with 6 topics.
ap_topics <- tidy(m_lda, matrix = "beta")#每一個主題的用詞機率
options(repr.plot.width=14, repr.plot.height=10)#輸出大小
ap_top_terms <- ap_topics %>%
group_by(topic) %>% #分組
top_n(10, beta) %>% #出現機率
ungroup() %>%
arrange(topic, -beta)
ap_top_terms$topic=ifelse(ap_top_terms$topic==1,"1.船隻原料",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==2,"2.馬頭1",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==3,"3.客戶關係",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==4,"4.馬頭2",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==5,"5.天氣",ap_top_terms$topic)
ap_top_terms$topic=ifelse(ap_top_terms$topic==6,"6.颱風",ap_top_terms$topic)
ap_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + #一分二
coord_flip() + #翻轉座標
scale_x_reordered() #去掉字尾
將個文章判定為機率最大的主題
ap_documents <- tidy(m_lda, matrix = "gamma")
m_gamma<-pivot_wider(ap_documents,names_from = topic, values_from = gamma)
m_gamma$document=as.numeric(m_gamma$document)
document_topic=data.frame(
document=m_gamma$document
)
for(i in 1:nrow(document_topic)){
document_topic$topic[i]=which.max(m_gamma[i,2:ncol(m_gamma)])
}
news_time=data.frame(date=original_data$date,
document=original_data$news_id)
document_topic<-inner_join(document_topic,news_time,by=c("document"="document"))
tag <- read.csv("C:/Users/VivoBook/Desktop/study/china_steel/join/data_318/inchcape_0428news.csv")
document_topic$tage=c()
id=document_topic$document[1]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage=tag$tag_name[which(tag$title==title)]
for(i in c(c(1:180),c(182:nrow(document_topic)))){ #181沒有標籤要跳過
id=document_topic$document[i]
title=original_data$title[which(original_data$news_id==id)]
document_topic$tage[i]=tag$tag_name[which(tag$title==title)]
}
(document_topic)[46:50,]
## document topic date tage
## 46 74 4 2019-09-21 Newsletters
## 47 84 2 2019-08-31 Port Advisories, Weather Advisories
## 48 91 2 2019-08-03 Newsletters
## 49 113 2 2019-06-22 Newsletters
## 50 132 6 2019-05-24 Port Advisories
document_topic2=(document_topic)
document_topic2$tage=as.character(document_topic2$tage)
document_topic2$tage<-str_replace_all(document_topic2$tage," ","")
document_topic2$tage1=NA
document_topic2$tage2=NA
document_topic2$tage3=NA
i=1
for(i in 1:nrow(document_topic)){
tmp=strsplit(document_topic2$tage[i],split=",")
L=length(tmp[[1]])
if(L>0){
for(j in 1:L){
document_topic2[i,j+4]=tmp[[1]][j]
}
}
}
tage_name=names(table(c(names(table(document_topic2$tage1)),names(table(document_topic2$tage2)),names(table(document_topic2$tage3)))))
(document_topic2)[46:50,]
## document topic date tage
## 46 74 4 2019-09-21 Newsletters
## 47 84 2 2019-08-31 PortAdvisories,WeatherAdvisories
## 48 91 2 2019-08-03 Newsletters
## 49 113 2 2019-06-22 Newsletters
## 50 132 6 2019-05-24 PortAdvisories
## tage1 tage2 tage3
## 46 Newsletters <NA> <NA>
## 47 PortAdvisories WeatherAdvisories <NA>
## 48 Newsletters <NA> <NA>
## 49 Newsletters <NA> <NA>
## 50 PortAdvisories <NA> <NA>
con=matrix(0,6,8)
colnames(con)=tage_name
rownames(con)=c("1.船隻原料","2.碼頭1","3.客戶關係","4.碼頭2","5.天氣","6.颱風")
for(k in 1:nrow(document_topic2)){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage1[k])
con[i,j]=con[i,j]+1
}
k=1
for(k in 1:nrow(document_topic2)){
if(is.na(document_topic2$tage2[k])==F){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage2[k])
con[i,j]=con[i,j]+1
}
}
for(k in 1:nrow(document_topic2)){
if(is.na(document_topic2$tage3[k])==F){
i<-document_topic2$topic[k]
j=which(tage_name==document_topic2$tage3[k])
con[i,j]=con[i,j]+1
}
}
colnames(con)=c("COVID-19","海事檢驗","市場情報","時事通訊","港口諮詢",
"綜合報導","產品服務新聞","天氣")
con
## COVID-19 海事檢驗 市場情報 時事通訊 港口諮詢 綜合報導
## 1.船隻原料 0 0 0 24 0 0
## 2.碼頭1 0 2 1 26 4 1
## 3.客戶關係 2 13 10 2 66 13
## 4.碼頭2 0 0 4 12 11 4
## 5.天氣 0 0 0 0 3 0
## 6.颱風 1 4 5 2 53 3
## 產品服務新聞 天氣
## 1.船隻原料 0 0
## 2.碼頭1 1 1
## 3.客戶關係 6 16
## 4.碼頭2 5 8
## 5.天氣 1 5
## 6.颱風 0 61