library(httr)
library(rvest)
library(cleanNLP)
#' Download an article page and return its body text as a single string.
#'
#' @param url Character scalar; address of the article page to fetch.
#' @return One character string: the trimmed text of every node matching the
#'   CSS selector "p.txt", joined by single spaces ("" when nothing matches).
#'   Signals an error when the server answers with an HTTP error status.
getArticleContent <- function(url) {
  response <- GET(url, user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15"))
  # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
  stop_for_status(response)
  page <- read_html(response)
  # trim = TRUE already strips leading/trailing whitespace of each paragraph.
  paragraphs <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  # Join with a space: collapsing with "" glues the last word of one paragraph
  # to the first word of the next, producing tokens such as "Oliveirense.The".
  paste(paragraphs, collapse = " ")
}
# Article pages (mainichi.jp English section) to download and analyse.
article_urls <- c(
  "https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c",
  "https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c",
  "https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c",
  "https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c"
)
# Number of articles queued for download.
length(article_urls)
[1] 4
# Tokenization (morphological analysis)
# NOTE(review): cnlp_annotate() needs an initialised backend (e.g. via
# cnlp_init_udpipe()); no such call is visible in this file -- confirm.
# Download every article. vapply() yields the plain character vector that
# cnlp_annotate() documents as its input (lapply() would hand it a list) and
# fails fast if a download does not produce exactly one string.
contents <- vapply(
  article_urls,
  getArticleContent,
  character(1),
  USE.NAMES = FALSE
)
# Keep only the token table of the annotation result.
annotedData <- cnlp_annotate(input = contents)$token
# Rows = tokens, columns = annotation fields.
dim(annotedData)
[1] 909 11
### Document-term matrix
# Count every lemma (rows) per document id (columns), then turn the
# contingency table into a plain data frame for inspection.
docMtx <- as.data.frame.matrix(
  table(annotedData[["lemma"]], annotedData[["doc_id"]])
)
View(docMtx)
# Three ways to reach the "token" column of the annotation table.
# 1) By name with `$`: the column comes back as a plain vector.
res1<-annotedData$token
res1[1:10]
[1] "TOKYO" "--" "An" "art" "deco" "building" "in" "the"
[9] "Japanese" "capital"
# 2) By position with `[[`: also a plain vector ("token" is column 4, as the
#    which() lookup further down confirms).
res2<-annotedData[[4]]
res2[1:10]
[1] "TOKYO" "--" "An" "art" "deco" "building" "in" "the"
[9] "Japanese" "capital"
# 3) By position with `[`: keeps the table container (one column), so rows
#    are selected with the [rows, ] form.
res3<-annotedData[4]
res3[1:10,]
# List the column names, then look up the position of "token" by name
# instead of relying on the hard-coded 4.
colnames(annotedData)
[1] "doc_id" "sid" "tid" "token" "token_with_ws" "lemma"
[7] "upos" "xpos" "feats" "tid_source" "relation"
which(colnames(annotedData)=="token")
[1] 4
# NOTE(review): `article1` is not defined in the visible code -- presumably the
# token vector of the first article (its first ten tokens match res1); confirm.
article1[1:10]
[1] "TOKYO" "--" "An" "art" "deco" "building" "in" "the"
[9] "Japanese" "capital"
# Frequency table of the tokens in article1, most frequent first.
freq_data<-sort(table(article1), decreasing=TRUE)
# Show the first 20 entries of the sorted table.
freq_data[1:20]
article1
the . , of a in 's * as and was
17 12 9 8 7 7 6 6 6 5 5
building The with - ( ) facility is Japan
4 4 4 3 3 3 3 3 3
# Unanchored pattern: matches every token that merely contains the word.
node <- "article"
grep(node, article1, value = TRUE)
[1] "article" "articles"
# Anchoring with ^ and $ restricts the match to the whole token.
(nodeLst <- grep("^article$", article1, value = TRUE))
[1] "article"
# Build the anchored pattern from the search word instead of hard-coding it.
node <- "article"
paste0("^", node, "$")
[1] "^article$"
# Keyword in context: the two tokens on either side of every whole-token,
# case-insensitive occurrence of `node` in article1.
node <- "facility"
search_node <- paste0("^", node, "$")
(nodeIndex <- grep(search_node, article1, ignore.case = TRUE))
[1] 97 164 251
# Positions before the first token become NA: an index of 0 would silently
# drop the element and a negative one would select the wrong tokens, leaving
# the context vectors misaligned with the hits.
Left1 <- article1[replace(nodeIndex - 1L, nodeIndex < 2L, NA)]
Left2 <- article1[replace(nodeIndex - 2L, nodeIndex < 3L, NA)]
# Positions past the last token already come back as NA.
Right1 <- article1[nodeIndex + 1L]
Right2 <- article1[nodeIndex + 2L]
# One row per hit: 2 left, 1 left, the search word, 1 right, 2 right.
cbind(Left2, Left1, node, Right1, Right2)
Left2 Left1 node Right1 Right2
[1,] "." "The" "facility" "has" "a"
[2,] "At" "the" "facility" "'s" "entrance"
[3,] "inside" "the" "facility" "." "("
# Collect the context vectors into a data frame, one row per hit.
collo <- data.frame(cbind(Left2, Left1, node, Right1, Right2))
# Label columns by distance from the search word (L = left, R = right).
colnames(collo) <- c("2L", "1L", "node", "1R", "2R")
# seq_len() rather than seq(): with zero rows seq(0) would yield c(1, 0).
rownames(collo) <- seq_len(nrow(collo))
collo
# Generalised keyword in context: `size` tokens on each side of every hit
# listed in nodeIndex.
size <- 4
# Not used by the code below; kept in case later code relies on it.
len <- length(article1) - size + 1
# Position matrix: one row per hit, one column per offset -size .. +size.
positions <- outer(nodeIndex, seq(-size, size), "+")
# A window that runs past either end of the text is padded with NA; the raw
# range (i - size):(i + size) would mix negative and positive subscripts and
# fail for a hit near the start.
positions[positions < 1L | positions > length(article1)] <- NA
# Look all tokens up in one step instead of growing a matrix with rbind().
colloLst <- data.frame(
  matrix(article1[positions], nrow = nrow(positions), ncol = ncol(positions))
)
# Column labels: sizeL .. 1L, node, 1R .. sizeR.
colnames(colloLst) <- c(
  paste0(rev(seq_len(size)), "L"),
  "node",
  paste0(seq_len(size), "R")
)
rownames(colloLst) <- seq_len(nrow(colloLst))
colloLst
# Reset the window size and search word, then locate every whole-token,
# case-insensitive occurrence of the word in article1.
size <- 4
node <- "facility"
search_node <- paste0("^", node, "$")
(nodeIndex <- grep(search_node, article1, ignore.case = TRUE))
[1] 97 164 251
# Manual walk-through of the context-window loop, one hit at a time.
# Start from an empty accumulator; each step appends one row to it.
colloLst <- c()
# Hit 1: take the first position in nodeIndex.
(i=nodeIndex[1])
[1] 97
# The window: `size` tokens before the hit through `size` tokens after it.
article1[(i-size):(i+size)]
[1] "Shoseki" "Co" "." "The" "facility" "has"
[7] "a" "large" "collection"
# Append the window as a new row and show the accumulator so far.
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
colloLst
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "Shoseki" "Co" "." "The" "facility" "has" "a" "large" "collection"
# Hit 2: same steps for the second position.
(i=nodeIndex[2])
[1] 164
article1[(i-size):(i+size)]
[1] "geometry" "." "At" "the" "facility" "'s" "entrance"
[8] "," "two"
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
colloLst
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "Shoseki" "Co" "." "The" "facility" "has" "a" "large" "collection"
[2,] "geometry" "." "At" "the" "facility" "'s" "entrance" "," "two"
# Hit 3: same steps for the third position.
(i=nodeIndex[3])
[1] 251
article1[(i-size):(i+size)]
[1] "or" "seeing" "inside" "the" "facility" "." "("
[8] "Japanese" "original"
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
# Final accumulator: one row per hit, 2 * size + 1 columns.
colloLst
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] "Shoseki" "Co" "." "The" "facility" "has" "a" "large"
[2,] "geometry" "." "At" "the" "facility" "'s" "entrance" ","
[3,] "or" "seeing" "inside" "the" "facility" "." "(" "Japanese"
[,9]
[1,] "collection"
[2,] "two"
[3,] "original"
# Search words to look up in the individual articles.
node_list <- c("of", "art", "Japan")
# getCollo() comes from an external script loaded here.
source("getCollo.R")
# NOTE(review): `articleLst` is not defined in the visible code -- presumably
# one token vector per article; confirm.
# Context table for the first search word in the second article.
getCollo(articleLst[[2]], current_node = node_list[1])
3L 2L 1L node 1R 2R 3R
1 "--" "A" "team" "of" "researchers" "," "including"
2 "explore" "the" "possibility" "of" "(" "health" ")"
3 "after" "a" "month" "of" "treatment" "with" "low"
# Third search word in the fourth article, with span = 5 (the printed table
# then runs from 5L to 5R).
getCollo(articleLst[[4]], current_node = node_list[3], span = 5)
5L 4L 3L 2L 1L node 1R 2R
1 "(" "Kyodo" ")" "--" "Former" "Japan" "striker" "Kazuyoshi"
2 "for" "Atletico" "Suzuka" "in" "the" "Japan" "Football" "League"
3 "Oliveirense.The" "Shizuoka" "Prefecture" "native" "left" "Japan" "at" "15"
4 "1986" "." "After" "returning" "to" "Japan" "in" "1990"
3R 4R 5R
1 "Miura" "said" "Monday"
2 "next" "year" "in"
3 "for" "Brazil" ","
4 "," "Miura" "won"
# Second search word in the fourth article: per the output below there is no
# match, so a message is printed and NULL comes back.
getCollo(articleLst[[4]], current_node = node_list[2])
[1] "The search term does not appear in this article"
NULL