data <- c("file1", "file2", "file3")
paste(data, ".txt", sep = "")
[1] "file1.txt" "file2.txt" "file3.txt"
paste0(data, ".txt")
[1] "file1.txt" "file2.txt" "file3.txt"
lapply(data, paste, ".txt")
[[1]]
[1] "file1 .txt"
[[2]]
[1] "file2 .txt"
[[3]]
[1] "file3 .txt"
sapply(data, paste, ".txt")
       file1        file2        file3 
"file1 .txt" "file2 .txt" "file3 .txt" 
test_matrix <- matrix(1:6, nrow = 3)  # 3 x 2 example matrix, filled column-wise
apply(test_matrix, c(1,2), function(x) x*10)
     [,1] [,2]
[1,]   10   40
[2,]   20   50
[3,]   30   60
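Because multiplying a matrix by a scalar is already elementwise in R, the same result needs no apply() at all:
test_matrix * 10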
dirName <-"testdata"
files<- list.files(dirName)
files
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
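test_tests itself is not constructed in this listing; assuming each file under testdata/ holds its whole article on a single line, one way to build it is:
test_tests <- sapply(filesDir, readLines)  # named character vector, one element per file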
test_tests
testdata/article1
"TOKYO An art deco building in the Japanese capital"
testdata/article2
"NAGOYA Kyodo A team of researchers"
testdata/article3
"TOKYO Kyodo Former Japan striker Kazuyoshi Miura"
library(cleanNLP)
cnlp_init_udpipe()
test_res <- cnlp_annotate(input = test_tests)
View(test_res$token)
test_res$token$lemma
[1] "Tokyo" "a" "art" "deco" "building" "in"
[7] "the" "japanese" "capital" "NAGOYA" "Kyodo" "a"
[13] "team" "of" "researcher" "Tokyo" "Kyodo" "former"
[19] "Japan" "striker" "Kazuyoshi" "Miura"
test_docMtx <- as.data.frame.matrix(table(test_res$token$lemma, test_res$token$doc_id))  # lemma-by-document count matrix
head(test_docMtx)
colnames(test_docMtx) <- files
head(test_docMtx)
Two common TF-IDF weightings, where tf is the term frequency (a raw count, or a count divided by document length), df is the number of documents containing the term, and N is the total number of documents:
\[ w = tf \cdot \log\left(\frac{N}{df}\right) \]
\[ w = tf \cdot \left(\log\left(\frac{N}{df}\right) + 1\right) \]
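For example, the lemma "Tokyo" occurs in 2 of the 3 test documents above, so with a raw count of 1 in article1 the first weighting gives (R's log() is the natural logarithm):
\[ w = 1 \cdot \log\left(\frac{3}{2}\right) \approx 0.405 \]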
N <- ncol(test_docMtx)  # N: the number of documents
docFreq <- apply(test_docMtx, 1, function(x) length(x[x>0]))  # df: documents containing each lemma
head(docFreq)
tf_idf1 <- test_docMtx*log(N/docFreq)
head(tf_idf1)
tf_idf2 <- test_docMtx*(log(N/docFreq)+1)
head(tf_idf2)
library(httr)
library(rvest)
getArticleContent <- function(url){
  # fetch the page with a browser-like user agent
  response <- GET(url, user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15"))
  page <- read_html(response)
  # extract the paragraphs matched by the site-specific selector "p.txt"
  article_content <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  cleaned_content <- trimws(article_content)
  # collapse the paragraphs into one string and return it
  paste(cleaned_content, collapse = "")
}
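getArticleContent assumes every request succeeds; a minimal defensive variant (a sketch only, the name getArticleContentSafe and the one-second delay are illustrative) could check the HTTP status first:
getArticleContentSafe <- function(url){
  response <- GET(url, user_agent("Mozilla/5.0"))
  if (status_code(response) != 200) return(NA_character_)  # skip failed requests
  Sys.sleep(1)  # stay polite to the server across repeated calls
  page <- read_html(response)
  paste(html_text(html_nodes(page, "p.txt"), trim = TRUE), collapse = "")
}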
article_urls <- c()
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c")
#article_urls[[1]] <- "https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c"
article_urls
length(article_urls)
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c")
length(article_urls)
content1 <- getArticleContent(article_urls[1])
substring(content1, 1, 100)
contents <- lapply(article_urls, getArticleContent)
for(txt in contents) print(substring(txt, 1, 80))
length(contents)
#cnlp_init_udpipe()
res <- cnlp_annotate(input = contents)
dim(res$token)
res <- res$token[!res$token$upos %in% c("PUNCT","SYM","NUM"),]  # keep only the token table, dropping punctuation, symbols and numbers
dim(res)
View(res)
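To confirm the filter, tabulate the part-of-speech tags that remain:
table(res$upos)  # PUNCT, SYM and NUM should no longer appear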
head(res[,colnames(res)=="lemma"])
res[res$lemma=="'s",]
docMtx <- as.data.frame.matrix(table(res$lemma, res$doc_id))
head(docMtx)
colnames(docMtx) <- paste0("article", 1:4)  # paste0 is vectorized over 1:4
colSums(docMtx)  # total tokens per article
article1 article2 article3 article4 
     256      248      148      134 
length(docMtx$article1[docMtx$article1>0])  # number of distinct lemmas in article1
[1] 147
N <- ncol(docMtx)  # N: the number of documents
docFreq <- apply(docMtx, 1, function(x) length(x[x>0]))  # df: documents containing each lemma
head(docFreq)
   's  40th     a about above   add 
    3     1     4     1     1     1 
# convert raw counts into relative term frequencies (tf = count / tokens per document)
docMtx <- as.data.frame(apply(docMtx, 2, function(x) x/sum(x)))
head(docMtx)
         article1    article2   article3    article4
's     0.02343750 0.004032258 0.00000000 0.007462687
40th   0.00000000 0.000000000 0.00000000 0.007462687
a      0.03125000 0.040322581 0.02027027 0.014925373
about  0.00390625 0.000000000 0.00000000 0.000000000
above  0.00390625 0.000000000 0.00000000 0.000000000
add    0.00390625 0.000000000 0.00000000 0.000000000
tf_idf1 <- docMtx*log(N/docFreq)
head(tf_idf1)
         article1    article2 article3    article4
's    0.006742549 0.001160008        0 0.002146881
40th  0.000000000 0.000000000        0 0.010345480
a     0.000000000 0.000000000        0 0.000000000
about 0.005415212 0.000000000        0 0.000000000
above 0.005415212 0.000000000        0 0.000000000
add   0.005415212 0.000000000        0 0.000000000
tf_idf2 <- docMtx*(log(N/docFreq)+1)
head(tf_idf2)
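As a quick sanity check of the weights, the lemmas can be ranked per article; a small sketch, assuming tf_idf1 as computed above:
head(tf_idf1[order(tf_idf1$article1, decreasing = TRUE), "article1", drop = FALSE], 5)  # top 5 lemmas in article1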