Lecture7: Collocation

ライブラリの読み込み

library(httr)
library(rvest)
library(cleanNLP)

文書-単語行列の作成 (Ref. Lec06)

オンライン記事情報を取得

自作関数

#' Download a web page and return its article text as a single string.
#'
#' @param url Character scalar; address of the page to fetch.
#' @return A length-one character vector: the trimmed text of every
#'   `p.txt` node on the page, pasted together without a separator.
#'   An HTTP error status (4xx/5xx) is signalled as an R error.
getArticleContent <- function(url) {
  response <- GET(
    url,
    user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15")
  )
  # Fail loudly on an error response instead of parsing the error page,
  # which would otherwise come back as an empty document.
  stop_for_status(response)
  page <- read_html(response)
  paragraphs <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  paragraphs <- trimws(paragraphs)
  # Last expression is a plain call (not an assignment) so the result is
  # returned visibly when the function is called at the console.
  paste(paragraphs, collapse = "")
}

記事データの取得

article_urls <- c()
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c")

article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c")
article_urls <- append(article_urls,"https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c")
length(article_urls)
[1] 4

文書単語行列

#Tokenization (形態素解析)
contents <- lapply(article_urls, getArticleContent)
annotedData<-cnlp_annotate(input = contents)$token
dim(annotedData)
[1] 909  11
### 文書単語行列
docMtx <- as.data.frame.matrix(table(annotedData$lemma, annotedData$doc_id))
View(docMtx)

特定の列情報をリスト型として抽出

res1<-annotedData$token
res1[1:10]
 [1] "TOKYO"    "--"       "An"       "art"      "deco"     "building" "in"       "the"     
 [9] "Japanese" "capital" 
res2<-annotedData[[4]]
res2[1:10]
 [1] "TOKYO"    "--"       "An"       "art"      "deco"     "building" "in"       "the"     
 [9] "Japanese" "capital" 

特定の列情報をdataframe型として抽出

res3<-annotedData[4]
res3[1:10,]

リスト内の要素の位置(index)を特定

colnames(annotedData)
 [1] "doc_id"        "sid"           "tid"           "token"         "token_with_ws" "lemma"        
 [7] "upos"          "xpos"          "feats"         "tid_source"    "relation"     
which(colnames(annotedData)=="token")
[1] 4

練習1:annotedDataからArticle1のtoken列を抽出してください

結果出力

article1[1:10]
 [1] "TOKYO"    "--"       "An"       "art"      "deco"     "building" "in"       "the"     
 [9] "Japanese" "capital" 

出現頻度表

freq_data<-sort(table(article1), decreasing=TRUE)
freq_data[1:20]
article1
     the        .        ,       of        a       in       's        *       as      and      was 
      17       12        9        8        7        7        6        6        6        5        5 
building      The     with        -        (        ) facility       is    Japan 
       4        4        4        3        3        3        3        3        3 

Collocation

中心語(node)の検索

部分一致(検索語を変数に格納)

node <- "article"
grep(node, article1, value=T)
[1] "article"  "articles"

完全一致

(nodeLst <- grep("^article$",article1, value=T))
[1] "article"

完全一致(検索語を変数に格納)

node <- "article"
paste0("^", node,"$")
[1] "^article$"

中心語(node)の出現位置検索

node <- "facility"
search_node <- paste0("^", node,"$")
(nodeIndex <- grep(search_node,article1, ignore.case = T))
[1]  97 164 251

周辺語の抽出

  • span=2 (中心語の左右2語)
Left1 <- article1[nodeIndex-1]
Left2 <- article1[nodeIndex-2]
Right1 <- article1[nodeIndex+1]
Right2 <- article1[nodeIndex+2]

collocationの列結合

cbind(Left2, Left1, node, Right1, Right2)
     Left2    Left1 node       Right1 Right2    
[1,] "."      "The" "facility" "has"  "a"       
[2,] "At"     "the" "facility" "'s"   "entrance"
[3,] "inside" "the" "facility" "."    "("       

data.frame: コンコーダンス(Concordance)

collo <- data.frame(cbind(Left2, Left1, node, Right1, Right2))
colnames(collo) <- c("2L","1L","node","1R","2R")
rownames(collo) <- seq(dim(collo)[1])
collo

Specify a variable span size

size <- 4

colloLst <- c()
len<-length(article1)-size+1

for(i in nodeIndex) {
  colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
}

colloLst <- data.frame(colloLst)
colnames(colloLst) <- c(paste0(seq(size, 1, -1),"L"),"node",paste0(seq(1,size),"R"))
rownames(colloLst) <- seq(dim(colloLst)[1])
colloLst

Specify a variable span size (code breakdown)

size <- 4
node <- "facility"
search_node <- paste0("^", node,"$")
(nodeIndex <- grep(search_node,article1, ignore.case = T))
[1]  97 164 251
colloLst <- c()
#nodeIndex[1]
(i=nodeIndex[1])
[1] 97
article1[(i-size):(i+size)]
[1] "Shoseki"    "Co"         "."          "The"        "facility"   "has"       
[7] "a"          "large"      "collection"
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
colloLst
     [,1]      [,2] [,3] [,4]  [,5]       [,6]  [,7] [,8]    [,9]        
[1,] "Shoseki" "Co" "."  "The" "facility" "has" "a"  "large" "collection"
#nodeIndex[2]
(i=nodeIndex[2])
[1] 164
article1[(i-size):(i+size)]
[1] "geometry" "."        "At"       "the"      "facility" "'s"       "entrance"
[8] ","        "two"     
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
colloLst
     [,1]       [,2] [,3] [,4]  [,5]       [,6]  [,7]       [,8]    [,9]        
[1,] "Shoseki"  "Co" "."  "The" "facility" "has" "a"        "large" "collection"
[2,] "geometry" "."  "At" "the" "facility" "'s"  "entrance" ","     "two"       
#nodeIndex[3]
(i=nodeIndex[3])
[1] 251
article1[(i-size):(i+size)]
[1] "or"       "seeing"   "inside"   "the"      "facility" "."        "("       
[8] "Japanese" "original"
colloLst<-rbind(colloLst,article1[(i-size):(i+size)])
colloLst
     [,1]       [,2]     [,3]     [,4]  [,5]       [,6]  [,7]       [,8]      
[1,] "Shoseki"  "Co"     "."      "The" "facility" "has" "a"        "large"   
[2,] "geometry" "."      "At"     "the" "facility" "'s"  "entrance" ","       
[3,] "or"       "seeing" "inside" "the" "facility" "."   "("        "Japanese"
     [,9]        
[1,] "collection"
[2,] "two"       
[3,] "original"  

課題2(締め切り12月3日)

annotedDataを使用して、次の3つの変数を引数とし、共起情報をコンコーダンス表示で出力する関数を作成してください

引数

  • 記事
  • キーワード(自分で2-3個選んでください)
  • スパンの長さ

Option: もしキーワードがヒットしなかった場合の処理についても考えてみてください(実行例3を参照)
node_list = c("of", "art", "Japan")
source("getCollo.R")

実行例1

getCollo(articleLst[[2]], current_node=node_list[1])
  3L        2L    1L            node 1R            2R       3R         
1 "--"      "A"   "team"        "of" "researchers" ","      "including"
2 "explore" "the" "possibility" "of" "("           "health" ")"        
3 "after"   "a"   "month"       "of" "treatment"   "with"   "low"      

実行例2

getCollo(articleLst[[4]], current_node=node_list[3], span=5)
  5L                4L         3L           2L          1L       node    1R         2R         
1 "("               "Kyodo"    ")"          "--"        "Former" "Japan" "striker"  "Kazuyoshi"
2 "for"             "Atletico" "Suzuka"     "in"        "the"    "Japan" "Football" "League"   
3 "Oliveirense.The" "Shizuoka" "Prefecture" "native"    "left"   "Japan" "at"       "15"       
4 "1986"            "."        "After"      "returning" "to"     "Japan" "in"       "1990"     
  3R      4R       5R      
1 "Miura" "said"   "Monday"
2 "next"  "year"   "in"    
3 "for"   "Brazil" ","     
4 ","     "Miura"  "won"   

実行例3

getCollo(articleLst[[4]], current_node=node_list[2])
[1] "The search term does not appear in this article"
NULL
---
title: "Lec07: Collocation"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

# Lecture7: Collocation

### ライブラリの読み込み
```{r}
library(httr)
library(rvest)
library(cleanNLP)
```

## 文書-単語行列の作成 (Ref. Lec06)
### オンライン記事情報を取得

### 自作関数
```{r}
#' Download a web page and return its article text as a single string.
#'
#' @param url Character scalar; address of the page to fetch.
#' @return A length-one character vector: the trimmed text of every
#'   `p.txt` node on the page, pasted together without a separator.
#'   An HTTP error status (4xx/5xx) is signalled as an R error.
getArticleContent <- function(url) {
  response <- GET(
    url,
    user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15")
  )
  # Fail loudly on an error response instead of parsing the error page,
  # which would otherwise come back as an empty document.
  stop_for_status(response)
  page <- read_html(response)
  paragraphs <- html_text(html_nodes(page, "p.txt"), trim = TRUE)
  paragraphs <- trimws(paragraphs)
  # Last expression is a plain call (not an assignment) so the result is
  # returned visibly when the function is called at the console.
  paste(paragraphs, collapse = "")
}
```

### ニュース記事
* <a href="https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c" target="_blank">Retro Japan: Tokyo textbook library designed in art deco style stores historical materials</a>
* <a href="https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c" target="_blank">Japan researchers to see if skin vibration boosts mental health</a>
* <a href="https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c" target="_blank">Ishiba suspected of falling asleep during Diet session to select PM</a>
* <a href="https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c" target="_blank">Football: Kazuyoshi Miura, 57, set to play 40th season as professional</a>

### 記事データの取得
```{r}
# URLs of the four news articles analysed in this lecture, in document order.
article_urls <- c(
  "https://mainichi.jp/english/articles/20241107/p2a/00m/0et/015000c",
  "https://mainichi.jp/english/articles/20241110/p2g/00m/0li/028000c",
  "https://mainichi.jp/english/articles/20241111/p2g/00m/0na/048000c",
  "https://mainichi.jp/english/articles/20241112/p2g/00m/0sp/005000c"
)
# Number of articles collected.
length(article_urls)
```
### 文書単語行列
```{r}
# Tokenization (morphological analysis): fetch the text of every URL, run the
# cleanNLP annotator on the texts, and keep the `token` element of the result.
contents <- lapply(article_urls, getArticleContent)
annotedData<-cnlp_annotate(input = contents)$token
# Rows x columns of the token table.
dim(annotedData)

# Document-term matrix: cross-tabulate lemma (rows) against doc_id (columns),
# so each cell is the number of times a lemma occurs in a document.
docMtx <- as.data.frame.matrix(table(annotedData$lemma, annotedData$doc_id))
# Opens the table in the interactive data viewer.
View(docMtx)
```
### 特定の列情報をリスト型として抽出
* <a href="https://dataanalytics.org.uk/r-object-elements-brackets-double-brackets-and/" target="_blank">Double brackets</a>
```{r}
# `$name` extracts the column itself (not a data frame) by its name.
res1<-annotedData$token
res1[1:10]
# `[[4]]` extracts a single column by position; the two printed results can
# be compared to check that position 4 is the same column as `token`.
res2<-annotedData[[4]]
res2[1:10]
```

### 特定の列情報をdataframe型として抽出
```{r}
# Single brackets keep the container: the result is still a table with one
# column rather than a bare vector.
res3<-annotedData[4]
# The trailing comma makes this a row selection: rows 1 to 10, all columns.
res3[1:10,]
```

### リスト内の要素の位置(index)を特定
```{r}
# List every column name of the token table.
colnames(annotedData)
# Position of the column called "token"; this is the number that can be used
# inside [[ ]] to select the column by index.
which(colnames(annotedData)=="token")
```

### <span style="color: blue; ">練習1</span>:annotedDataからArticle1のtoken列を抽出してください
```{r, echo=FALSE}
# Exercise 1 answer: the values of the `colname` column for the rows whose
# doc_id equals 1 (the first article).
colname <- "token"
article1 <- annotedData[[colname]][annotedData$doc_id == 1]
```
#### 結果出力 
```{r}
# Show the first ten tokens of article 1.
article1[1:10]
```
### 出現頻度表
```{r}
# Count how often each distinct token occurs and order the counts from most
# to least frequent.
freq_data<-sort(table(article1), decreasing=TRUE)
# The twenty most frequent tokens.
freq_data[1:20]
```

# Collocation
## 中心語(node)の検索
* ignore.case: TRUE を指定すると大文字・小文字を区別せずに検索（既定値の FALSE は区別する）
* 文字検索: <a href="https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/grep" target="_blank">grep</a>
* <a href="https://stats.biopapyrus.jp/r/devel/regex.html" target="_blank">grep使用例</a>

### 部分一致（検索語を変数に格納）
```{r}
# Partial match: the pattern has no anchors, so every token that contains the
# search term anywhere is returned.
node <- "article"
# value = TRUE returns the matching tokens themselves instead of their
# positions. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
grep(node, article1, value = TRUE)
```

### 完全一致
```{r}
# Exact match: ^ and $ anchor the pattern to the start and end of the token.
# The outer parentheses print the assigned value. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
(nodeLst <- grep("^article$", article1, value = TRUE))
```

### 完全一致（検索語を変数に格納）
```{r}
# Build the exact-match pattern from a variable: wrap the search term in the
# ^ (start) and $ (end) anchors.
node <- "article"
paste0("^", node,"$")
```

## 中心語(node)の出現位置検索
```{r}
# Positions of the node word in article 1 (exact match, case-insensitive).
node <- "facility"
search_node <- paste0("^", node, "$")
# Without value = TRUE, grep() returns the indices of the matching tokens.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
(nodeIndex <- grep(search_node, article1, ignore.case = TRUE))
```
## 周辺語の抽出
* span=2 (中心語の左右２語)
```{r}
# Tokens one and two positions to the left/right of every hit (span = 2).
# A hit within the first two tokens would give a left index of 0 or below;
# R silently drops a 0 index and treats negative ones as exclusions, so the
# four vectors would stop lining up hit by hit. Those positions are mapped to
# NA instead, which is what indexing past the end of article1 already yields
# on the right-hand side.
Left1 <- article1[ifelse(nodeIndex > 1, nodeIndex - 1L, NA_integer_)]
Left2 <- article1[ifelse(nodeIndex > 2, nodeIndex - 2L, NA_integer_)]
Right1 <- article1[nodeIndex + 1L]
Right2 <- article1[nodeIndex + 2L]
```

### collocationの列結合
```{r}
# Bind the context vectors side by side: one row per hit, columns ordered
# left to right around the node word. `node` is a single string, so cbind()
# repeats it on every row.
cbind(Left2, Left1, node, Right1, Right2)
```
### data.frame: コンコーダンス(Concordance)
```{r}
# Concordance table: one row per hit, columns are the positions around the
# node word (2L = two to the left ... 2R = two to the right).
collo <- data.frame(cbind(Left2, Left1, node, Right1, Right2))
colnames(collo) <- c("2L", "1L", "node", "1R", "2R")
# seq_len() yields an empty sequence for a zero-row table, whereas
# seq(0) would yield c(1, 0) and make the assignment fail.
rownames(collo) <- seq_len(nrow(collo))
collo
```
### Specify a variable span size
```{r}
# Number of context tokens to keep on each side of the node word.
size <- 4

node <- "facility"
search_node <- paste0("^", node, "$")
nodeIndex <- grep(search_node, article1, ignore.case = TRUE)

# Start from an empty matrix that already has one column per window position
# (size left + node + size right). With no hits the result is then a zero-row
# table with the right columns instead of an error when naming the columns.
colloLst <- matrix(character(0), nrow = 0, ncol = 2 * size + 1)

for (i in nodeIndex) {
  window_pos <- (i - size):(i + size)
  # Positions before the first or after the last token become NA, so a hit
  # near either end of the article still yields a full-width row.
  window_pos[window_pos < 1 | window_pos > length(article1)] <- NA
  colloLst <- rbind(colloLst, article1[window_pos])
}

colloLst <- data.frame(colloLst)
# Column labels: "4L" ... "1L", "node", "1R" ... "4R" for size = 4.
colnames(colloLst) <- c(
  paste0(seq(size, 1, -1), "L"),
  "node",
  paste0(seq(1, size), "R")
)
# seq_len() is safe for a zero-row table; seq(0) would yield c(1, 0).
rownames(colloLst) <- seq_len(nrow(colloLst))
colloLst
```
### Specify a variable span size (code breakdown)
```{r}
# Step-by-step walk through the windowing loop, one hit at a time.
size <- 4
node <- "facility"
search_node <- paste0("^", node, "$")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
(nodeIndex <- grep(search_node, article1, ignore.case = TRUE))

# Empty accumulator; each step appends one row with rbind().
colloLst <- c()
# nodeIndex[1]: first hit
(i <- nodeIndex[1])
# Window of `size` tokens on each side of position i.
article1[(i - size):(i + size)]
colloLst <- rbind(colloLst, article1[(i - size):(i + size)])
colloLst
# nodeIndex[2]: second hit
(i <- nodeIndex[2])
article1[(i - size):(i + size)]
colloLst <- rbind(colloLst, article1[(i - size):(i + size)])
colloLst
# nodeIndex[3]: third hit
(i <- nodeIndex[3])
article1[(i - size):(i + size)]
colloLst <- rbind(colloLst, article1[(i - size):(i + size)])
colloLst
```

## 課題２（締め切り12月3日）
### annotedDataを使用して、次の３つの変数を引数とし、共起情報をコンコーダンス表示で出力する関数を作成してください
#### 引数
- 記事
- キーワード（自分で2-3個選んでください）
- スパンの長さ

#### Option: もしキーワードがヒットしなかった場合の処理についても考えてみてください（実行例3を参照）
```{r}
# Keywords (node words) looked up in the example runs that follow.
node_list <- c("of", "art", "Japan")
# Load the file "getCollo.R" (relative path); the example runs call getCollo().
# NOTE(review): the example runs also use `articleLst`, which is not created
# anywhere in this notebook - presumably getCollo.R provides it; confirm.
source("getCollo.R")
```

### 実行例1
```{r}
# Example 1: concordance for node_list[1] ("of") in the second element of
# articleLst; `span` is not passed, so getCollo's own default applies.
# NOTE(review): articleLst is not defined in this notebook - confirm where it
# comes from.
getCollo(articleLst[[2]], current_node=node_list[1])
```
### 実行例2
```{r}
# Example 2: concordance for node_list[3] ("Japan") in the fourth element of
# articleLst, with an explicit span of 5.
getCollo(articleLst[[4]], current_node=node_list[3], span=5)
```

### 実行例3
```{r}
# Example 3: node_list[2] ("art") looked up in the fourth element of
# articleLst; this run illustrates the optional "keyword not found" handling
# asked for in the assignment.
getCollo(articleLst[[4]], current_node=node_list[2])
```
