TF-IDF

apply関数

lapply関数

data <- c("file1", "file2", "file3")
paste(data, ".txt" , sep="")
[1] "file1.txt" "file2.txt" "file3.txt"
paste0(data, ".txt")
[1] "file1.txt" "file2.txt" "file3.txt"
lapply(data, paste, ".txt")
[[1]]
[1] "file1 .txt"

[[2]]
[1] "file2 .txt"

[[3]]
[1] "file3 .txt"

sapply関数()

  • 名前の属性付き
sapply(data, paste, ".txt")
       file1        file2        file3 
"file1 .txt" "file2 .txt" "file3 .txt" 

apply関数

apply(test_matrix, c(1,2), function(x) x*10)
     [,1] [,2]
[1,]   10   40
[2,]   20   50
[3,]   30   60

テキストファイルの読み込み

ディレクトリ内ファイル名の取得

dirName <-"testdata"
files<- list.files(dirName)
files

相対パスの取得

filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir

ファイルの読み込み

test_tests
                                   testdata/article1 
"TOKYO An art deco building in the Japanese capital" 
                                   testdata/article2 
                "NAGOYA Kyodo A team of researchers" 
                                   testdata/article3 
  "TOKYO Kyodo Former Japan striker Kazuyoshi Miura" 

cleanNLPパッケージによる文書頻度行列の作成

ライブラリの読み込み

library(cleanNLP)

文書頻度行列

cnlp_init_udpipe()

test_res<-cnlp_annotate(input = test_tests)
View(test_res$token)

test_res$token$lemma
 [1] "Tokyo"      "a"          "art"        "deco"       "building"   "in"        
 [7] "the"        "japanese"   "capital"    "NAGOYA"     "Kyodo"      "a"         
[13] "team"       "of"         "researcher" "Tokyo"      "Kyodo"      "former"    
[19] "Japan"      "striker"    "Kazuyoshi"  "Miura"     
test_docMtx <- as.data.frame.matrix(table(test_res$token$lemma, test_res$token$doc_id))
head(test_docMtx)
colnames(test_docMtx) <- files
head(test_docMtx)

TF-IDF

  • テキスト特有の出現単語に対して、重みづけをする
  • テキストに共通する単語に対しては(低い)重み付け

TF-IDF 1

\[w=tf*log(\frac{N}{df}) \]

TF-IDF 2

\[w=tf*(log(\frac{N}{df})+1) \]

Document Frequency

# the document numbers
N<-ncol(test_docMtx)

docFreq<-apply(test_docMtx, 1, function(x) length(x[x>0]))
head(docFreq)

TF-IDF 1

tf_idf1 <- test_docMtx*log(N/docFreq)
head(tf_idf1)

TF-IDF 2

tf_idf2 <- test_docMtx*(log(N/docFreq)+1)
head(tf_idf2)

オンライン記事から情報を取得 (Ref. Lec02)

ライブラリの読み込み

library(httr)
library(rvest)

自作関数

getArticleContent <- function(url){
  response <- GET(url, user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15"))
  page <- read_html(response)
  article_content <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  cleaned_content <- trimws(article_content)
  cleaned_content <- paste(cleaned_content, collapse = "")
}

記事データの取得

article_urls <- c()
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c")
#article_urls[[1]] <- "https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c"

article_urls 
length(article_urls)
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c")
length(article_urls)

substring of a content

content1 <- getArticleContent(article_urls[1])
substring(content1, 1, 100)
contents <- lapply(article_urls, getArticleContent)
for(txt in contents)  print(substring(txt, 1, 80))
length(contents)

Tokenization (形態素解析)

#cnlp_init_udpipe()
res<-cnlp_annotate(input = contents)
dim(res$token)

記号・数字を除去

res <- res$token[!res$token$upos %in% c("PUNCT","SYM","NUM"),]
dim(res)
View(res)

条件一致による列抽出

head(res[,colnames(res)=="lemma"])

条件一致による行抽出

res[res$lemma=="'s",]

文書頻度行列

docMtx <- as.data.frame.matrix(table(res$lemma, res$doc_id))
head(docMtx)

列名の変更

colnames(docMtx) <- sapply(seq(1:4), function(x) paste0("article", x))

Tokens

colSums(docMtx) #Tokens
article1 article2 article3 article4 
     256      248      148      134 

Types

length(docMtx$article1[docMtx$article1>0])
[1] 147

Types

length(docMtx$article1[docMtx$article1>0])
[1] 147

Document Freqency

N<-ncol(docMtx)

docFreq<-apply(docMtx, 1, function(x) length(x[x>0]))
head(docFreq)
   's  40th     a about above   add 
    3     1     4     1     1     1 

tf_idf1

tf_idf1 <- docMtx*log(N/docFreq)
head(tf_idf1)

tf_idf1

tf_idf2 <- docMtx*(log(N/docFreq)+1)
head(tf_idf2)

練習:Apply関数を活用して、上のdocMtxの素頻度文書行列を相対頻度文書行列に変換し、TF-IDF1を計算してください

相対頻度文書行列の一部表示

        article1    article2   article3    article4
's    0.02343750 0.004032258 0.00000000 0.007462687
40th  0.00000000 0.000000000 0.00000000 0.007462687
a     0.03125000 0.040322581 0.02027027 0.014925373
about 0.00390625 0.000000000 0.00000000 0.000000000
above 0.00390625 0.000000000 0.00000000 0.000000000
add   0.00390625 0.000000000 0.00000000 0.000000000

tf_idf1計算結果一部表示

         article1    article2 article3    article4
's    0.006742549 0.001160008        0 0.002146881
40th  0.000000000 0.000000000        0 0.010345480
a     0.000000000 0.000000000        0 0.000000000
about 0.005415212 0.000000000        0 0.000000000
above 0.005415212 0.000000000        0 0.000000000
add   0.005415212 0.000000000        0 0.000000000
---
title: "Lec06: TF-IDF"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---
# TF-IDF
+ <a href="https://www.geeksforgeeks.org/an-easy-approach-to-tf-idf-using-r/" target="_blank">tm package</a>
+ <a href="https://saraswatmks.github.io/2020/02/tfidf-matrix-superml-R.html" target="_blank">superml package</a>

## apply関数
+  <a href="https://stats.biopapyrus.jp/r/basic/apply.html" target="_blank">参考資料</a>

### lapply関数
```{r}
data <- c("file1", "file2", "file3")
paste(data, ".txt" , sep="")
paste0(data, ".txt")
lapply(data, paste, ".txt")
```
### sapply関数()
+ 名前の属性付き
```{r}
sapply(data, paste, ".txt")
```
### apply関数
```{r}
test_matrix <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)
test_matrix

#rowSums(test_matrix)
apply(test_matrix, 1, sum) 

#cowSums(test_matrix)
apply(test_matrix, 2, sum)

apply(test_matrix, c(1,2), function(x) x*10)
```

## テキストファイルの読み込み
### ディレクトリ内ファイル名の取得
```{r}
dirName <-"testdata"
files<- list.files(dirName)
files
```

### 相対パスの取得
```{r}
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
```

###  ファイルの読み込み
```{r}
test_tests <- sapply(filesDir, readLines)
```
## <a href="https://cran.r-project.org/web/packages/cleanNLP/cleanNLP.pdf" target="_blank">cleanNLPパッケージ</a>による文書頻度行列の作成

### ライブラリの読み込み
```{r}
library(cleanNLP)
```

### 文書頻度行列
```{r}
cnlp_init_udpipe()

test_res<-cnlp_annotate(input = test_tests)
#View(test_res$token)

test_res$token$lemma

test_docMtx <- as.data.frame.matrix(table(test_res$token$lemma, test_res$token$doc_id))
head(test_docMtx)
colnames(test_docMtx) <- files
head(test_docMtx)
```

## TF-IDF
+ テキスト特有の出現単語に対して、重みづけをする
+ テキストに共通する単語に対しては（低い）重み付け

### TF-IDF 1
$$w=tf*log(\frac{N}{df}) $$

### TF-IDF 2
$$w=tf*(log(\frac{N}{df})+1) $$

### Document Frequency
```{r}
# the document numbers
N<-ncol(test_docMtx)

docFreq<-apply(test_docMtx, 1, function(x) length(x[x>0]))
head(docFreq)
```

### TF-IDF 1
```{r}
tf_idf1 <- test_docMtx*log(N/docFreq)
head(tf_idf1)
```

### TF-IDF 2
```{r}
tf_idf2 <- test_docMtx*(log(N/docFreq)+1)
head(tf_idf2)
```

## オンライン記事から情報を取得 (Ref. Lec02)
### ライブラリの読み込み
```{r}
library(httr)
library(rvest)
```

### 自作関数
```{r}
getArticleContent <- function(url){
  response <- GET(url, user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15"))
  page <- read_html(response)
  article_content <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  cleaned_content <- trimws(article_content)
  cleaned_content <- paste(cleaned_content, collapse = "")
}
```

### ニュース記事
* <a href="https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c" target="_blank">Retro Japan: Tokyo textbook library designed in art deco style stores historical materials</a>
* <a href="https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c" target="_blank">Japan researchers to see if skin vibration boosts mental health</a>
* <a href="https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c" target="_blank">Ishiba suspected of falling asleep during Diet session to select PM</a>
* <a href="https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c" target="_blank">Football: Kazuyoshi Miura, 57, set to play 40th season as professional</a>

### 記事データの取得
```{r}
article_urls <- c()
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c")
#article_urls[[1]] <- "https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c"

article_urls 
length(article_urls)
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c")
length(article_urls)
```

### substring of a content
```{r}
content1 <- getArticleContent(article_urls[1])
substring(content1, 1, 100)
```

```{r}
contents <- lapply(article_urls, getArticleContent)
for(txt in contents)  print(substring(txt, 1, 80))
length(contents)
```

### Tokenization (形態素解析)
```{r}
#cnlp_init_udpipe()
res<-cnlp_annotate(input = contents)
dim(res$token)
```

### 記号・数字を除去
```{r}
res <- res$token[!res$token$upos %in% c("PUNCT","SYM","NUM"),]
dim(res)
View(res)
```

### 条件一致による列抽出
```{r}
head(res[,colnames(res)=="lemma"])
```

### 条件一致による行抽出
```{r}
res[res$lemma=="'s",]
```

### 文書頻度行列
```{r}
docMtx <- as.data.frame.matrix(table(res$lemma, res$doc_id))
head(docMtx)
```

### 列名の変更
```{r}
colnames(docMtx) <- sapply(seq(1:4), function(x) paste0("article", x))
```

### Tokens
```{r}
colSums(docMtx) #Tokens
```

### Types
```{r}
length(docMtx$article1[docMtx$article1>0])
```

### Types
```{r}
length(docMtx$article1[docMtx$article1>0])
```
### Document Freqency
```{r}
N<-ncol(docMtx)

docFreq<-apply(docMtx, 1, function(x) length(x[x>0]))
head(docFreq)
```

### tf_idf1
```{r}
tf_idf1 <- docMtx*log(N/docFreq)
head(tf_idf1)
```

### tf_idf1
```{r}
tf_idf2 <- docMtx*(log(N/docFreq)+1)
head(tf_idf2)
```

## <span style="color: blue; ">練習</span>:Apply関数を活用して、上のdocMtxの素頻度文書行列を相対頻度文書行列に変換し、TF-IDF1を計算してください

```{r, echo=FALSE}
N<-ncol(docMtx)
docFreq<-apply(docMtx, 1, function(x) length(x[x>0]))

relative_docMtx <- apply(docMtx, 2, function(x) x/sum(x))
relative_tf_idf1 <- relative_docMtx*log(N/docFreq)
```
### 相対頻度文書行列の一部表示
```{r, echo=FALSE}
head(relative_docMtx)
```

### tf_idf1計算結果一部表示
```{r, echo=FALSE}
head(relative_tf_idf1)
```


