Lecture7: Network representation, TwitteR

文字単位のn-gram:英文

# English sample sentence used for the character-level n-gram demo.
test1 <- "Osaka University said Wednesday that personal data of around 80,000 students"
# The first 4-gram is simply the first four characters.
substr(test1, 1, 4)

## [1] "Osak"

# Build every character 4-gram of test1 as a one-column character matrix.
size <- 4
# Number of complete n-grams: the last valid start position is nchar - size + 1.
len <- nchar(test1) - size + 1
# Iterate over 1..len, not 1..nchar(test1): start positions past `len` would
# yield truncated fragments ("nts", "ts", "s"). max(len, 0) also covers
# strings shorter than `size`, for which no n-gram exists.
starts <- seq_len(max(len, 0))
ngramLst <- matrix(
  vapply(starts, function(i) substr(test1, i, i + size - 1), character(1)),
  ncol = 1
)

head(ngramLst)

##      [,1]  
## [1,] "Osak"
## [2,] "saka"
## [3,] "aka "
## [4,] "ka U"
## [5,] "a Un"
## [6,] " Uni"

文字単位のn-gram:日本語

準備

# Japanese sample sentence for the character-level n-gram demo.
test2 <- "大阪大学は１３日、学内システムに海外から不正アクセスがあり、"
# First four characters of the sentence.
substring(test2, first = 1, last = 4)

## [1] "大阪大学"

# Sentence length counted in characters.
nchar(test2, type = "chars")

## [1] 30

文字単位のngram

# Slide a window of `size` characters over test2 and stack the resulting
# n-grams into a one-column character matrix.
size <- 4
# Last start position that still yields a full-length n-gram.
len <- nchar(test2) - size + 1
ngramLst <- do.call(
  rbind,
  lapply(seq(1, len), function(pos) substr(test2, pos, pos + size - 1))
)

head(ngramLst)

##      [,1]      
## [1,] "大阪大学"
## [2,] "阪大学は"
## [3,] "大学は１"
## [4,] "学は１３"
## [5,] "は１３日"
## [6,] "１３日、"

単語単位のn-gram

準備：単語単位のリスト作成

# Tokenise test1 into lower-case words: split on any whitespace or punctuation
# character, flatten the strsplit() list, then drop the empty strings left
# between adjacent delimiters.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
wordLst <- wordLst[nzchar(wordLst)]
wordLst

##  [1] "osaka"      "university" "said"       "wednesday"  "that"      
##  [6] "personal"   "data"       "of"         "around"     "80"        
## [11] "000"        "students"

単語単位のngram

# Take one word n-gram: `size` consecutive words beginning at index `start`.
size <- 3
start <- 1
wordLst[seq(from = start, length.out = size)]

## [1] "osaka"      "university" "said"

# Cut wordLst into consecutive, non-overlapping groups of `size` words, one
# group per matrix row. Start positions never exceed `len`, so every row is a
# complete n-gram.
len <- length(wordLst) - size + 1
starts <- seq(1, len, by = size)
strLst <- do.call(
  rbind,
  lapply(starts, function(s) wordLst[s:(s + size - 1)])
)
strLst

##      [,1]        [,2]         [,3]      
## [1,] "osaka"     "university" "said"    
## [2,] "wednesday" "that"       "personal"
## [3,] "data"      "of"         "around"  
## [4,] "80"        "000"        "students"

RMeCabによる日本語形態素解析

# Load RMeCab and run RMeCabC() on the Japanese sample sentence. unlist()
# flattens the result into a named character vector; per the printed output,
# the names are part-of-speech tags and the values are the tokens.
library("RMeCab")
wordLst<-unlist(RMeCabC(test2))
wordLst

##       名詞       助詞       名詞       名詞       名詞       記号 
## "大阪大学"       "は"       "１"       "３"       "日"       "、" 
##       名詞       名詞       助詞       名詞       助詞       名詞 
##     "学内" "システム"       "に"     "海外"     "から"     "不正" 
##       名詞       助詞       動詞       記号 
## "アクセス"       "が"     "あり"       "、"

記号を除去

# Remove tokens whose part-of-speech tag (the element name) is "記号" (symbol).
is_symbol <- names(wordLst) == "記号"
wordLst <- wordLst[!is_symbol]
wordLst

##       名詞       助詞       名詞       名詞       名詞       名詞 
## "大阪大学"       "は"       "１"       "３"       "日"     "学内" 
##       名詞       助詞       名詞       助詞       名詞       名詞 
## "システム"       "に"     "海外"     "から"     "不正" "アクセス" 
##       助詞       動詞 
##       "が"     "あり"

単語単位のngram

# Group the symbol-free Japanese tokens into consecutive, non-overlapping
# word 3-grams, one per matrix row. rbind() takes the column names from the
# first row's element names, which is why the printed header shows POS tags.
size <- 3
len <- length(wordLst) - size + 1
starts <- seq(1, len, by = size)
strLst <- do.call(
  rbind,
  lapply(starts, function(s) wordLst[s:(s + size - 1)])
)
strLst

##      名詞       助詞   名詞      
## [1,] "大阪大学" "は"   "１"      
## [2,] "３"       "日"   "学内"    
## [3,] "システム" "に"   "海外"    
## [4,] "から"     "不正" "アクセス"

ネットワーク描画

使用データ

# Rebuild the English word list for the network example: split test1 on
# whitespace/punctuation, lower-case the pieces, and discard empty strings.
pieces <- strsplit(test1, "[[:space:]]|[[:punct:]]")
wordLst <- tolower(unlist(pieces))
wordLst <- wordLst[nzchar(wordLst)]
wordLst

##  [1] "osaka"      "university" "said"       "wednesday"  "that"      
##  [6] "personal"   "data"       "of"         "around"     "80"        
## [11] "000"        "students"

ngramによる区分: ngram=3, step=2

# Overlapping word n-grams: windows of `size` words whose start positions
# advance by `step`, so consecutive rows share size - step words.
size <- 3
step <- 2
len <- length(wordLst) - size + 1
starts <- seq(1, len, by = step)
# vapply() returns one column per window; transpose to get one row per n-gram.
strLst <- t(vapply(
  starts,
  function(s) wordLst[s:(s + size - 1)],
  character(size)
))
strLst

##      [,1]     [,2]         [,3]    
## [1,] "osaka"  "university" "said"  
## [2,] "said"   "wednesday"  "that"  
## [3,] "that"   "personal"   "data"  
## [4,] "data"   "of"         "around"
## [5,] "around" "80"         "000"

ペアデータの作成

一行のデータ

# Enumerate every unordered word pair inside one n-gram (one row of strLst).
str <- strLst[1, ]

# combn() yields the 2-element combinations in the same (i < j) order as a
# nested i/j loop, without growing a matrix via rbind(). The length guard
# matters because 1:(n - 1) and (i + 1):n count backwards when fewer than two
# words are present, which would emit NA pairs.
if (length(str) >= 2) {
  pLst <- t(combn(str, 2))
} else {
  pLst <- matrix(character(0), ncol = 2)
}
pLst

##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "said"      
## [3,] "university" "said"

netwkPairs.R（ペア単語取得用関数）のロード

# Load the pair-extraction helpers from netwkPairs.R in the working directory.
# NOTE(review): presumably this file defines getPairs(), getPairsLst() and
# getPairsFreq() used below; confirm against netwkPairs.R.
source("netwkPairs.R")

ペアデータの取得

# Word pairs of the first n-gram; per the printed output this matches the
# hand-built pLst above.
getPairs(strLst[1,])

##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "said"      
## [3,] "university" "said"

# Word pairs of the second n-gram.
getPairs(strLst[2,])

##      [,1]        [,2]       
## [1,] "said"      "wednesday"
## [2,] "said"      "that"     
## [3,] "wednesday" "that"

# Pairs for all n-grams; per the printed output, the per-row pair tables are
# stacked in row order.
getPairsLst(strLst)

##       [,1]         [,2]        
##  [1,] "osaka"      "university"
##  [2,] "osaka"      "said"      
##  [3,] "university" "said"      
##  [4,] "said"       "wednesday" 
##  [5,] "said"       "that"      
##  [6,] "wednesday"  "that"      
##  [7,] "that"       "personal"  
##  [8,] "that"       "data"      
##  [9,] "personal"   "data"      
## [10,] "data"       "of"        
## [11,] "data"       "around"    
## [12,] "of"         "around"    
## [13,] "around"     "80"        
## [14,] "around"     "000"       
## [15,] "80"         "000"

ペア頻度表

# Pair frequency table; per the printed output it has columns Term1, Term2
# and Freq, one row per word pair.
pFreq<-getPairsFreq(strLst)
head(pFreq)

##        Term1      Term2 Freq
## 1      osaka university    1
## 2      osaka       said    1
## 3 university       said    1
## 4       said  wednesday    1
## 5       said       that    1
## 6  wednesday       that    1

igraphをインストールする

# Install igraph only when it is missing, so re-running the script does not
# reinstall the package (or require network access) every time.
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}

igraphを利用した描画（有向グラフ）

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# Build a directed word graph from the pair table: the first two columns of
# pFreq are the edge endpoints and any further columns become edge attributes.
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame().
wng <- graph_from_data_frame(pFreq)
plot(wng)

igraphを利用した描画（無向グラフ）

# Same graph with edge direction discarded. graph_from_data_frame() replaces
# the deprecated graph.data.frame(); as.undirected() is kept so the script
# also runs on igraph versions that predate as_undirected().
wng <- as.undirected(graph_from_data_frame(pFreq))
plot(wng)

igraphを利用した描画（エッジ幅とノードの大きさを調整）

# Undirected word graph with edge width = pair frequency and vertex size
# proportional to degree.
# Build the graph undirected from the start so that edges stay in pFreq row
# order and keep their Freq attribute. Converting a directed graph with
# as.undirected() (default mode "collapse") may merge and reorder edges, after
# which a positional `pFreq$Freq` assignment can mislabel edges or fail on a
# length mismatch.
wng <- graph_from_data_frame(pFreq, directed = FALSE)
E(wng)$weight <- E(wng)$Freq
# Merge parallel edges (e.g. a-b and b-a) into one, summing their weights;
# loops are kept and other edge attributes are dropped.
wng <- simplify(
  wng,
  remove.multiple = TRUE,
  remove.loops = FALSE,
  edge.attr.comb = list(weight = "sum", "ignore")
)
deg <- degree(wng)
# Scale vertex size so the highest-degree word gets size 30.
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

ノードの色, フォントサイズ

# Weighted undirected word graph with custom vertex colour, shape and labels.
# Build the graph undirected from the start so that edges stay in pFreq row
# order and keep their Freq attribute. Converting a directed graph with
# as.undirected() (default mode "collapse") may merge and reorder edges, after
# which a positional `pFreq$Freq` assignment can mislabel edges or fail on a
# length mismatch.
wng <- graph_from_data_frame(pFreq, directed = FALSE)
E(wng)$weight <- E(wng)$Freq
# Merge parallel edges (e.g. a-b and b-a) into one, summing their weights;
# loops are kept and other edge attributes are dropped.
wng <- simplify(
  wng,
  remove.multiple = TRUE,
  remove.loops = FALSE,
  edge.attr.comb = list(weight = "sum", "ignore")
)
deg <- degree(wng)
# Vertex appearance: fill colour, shape, label size and label colour.
V(wng)$color <- "lightblue"
V(wng)$shape <- "rectangle"
V(wng)$label.cex <- 1.5
V(wng)$label.color <- "red"
# Scale vertex size so the highest-degree word gets size 30.
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

tkplot

# Open the same graph in igraph's tkplot() window, passing the same edge-width
# and vertex-size arguments as the plot() call above; relies on `wng` and `deg`
# from the preceding block.
tkplot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))

Twitterアプリケーションの作成＆登録

OAuth 認証用

https://apps.twitter.com/

ROAuth, twitteRのインストール

  # Install the Twitter client and the OAuth helper package.
  # Indentation must be ASCII spaces only: a full-width space (U+3000) at the
  # start of a line is a syntax error in R.
  install.packages("twitteR")
  install.packages("ROAuth")

ROAuth, twitteRの読み込み

  library(twitteR)
  library(ROAuth)

twitteRからのOAuth認証

cacert.pemをダウンロード

  # Fetch the CA certificate bundle that is later passed as `cainfo` to the
  # OAuth handshake. Use HTTPS on the canonical curl.se host: a trust store
  # downloaded over plain HTTP can be tampered with in transit.
  download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")

認証情報(Twitterアプリケーション)

# Twitter application credentials (masked placeholders) and the three OAuth
# endpoint URLs.
consr_key <- "***********"
consr_secrt <- "***********"
req_url <- "https://api.twitter.com/oauth/request_token"
acs_url <- "https://api.twitter.com/oauth/access_token"
auth_url <- "https://api.twitter.com/oauth/authorize"

# Bundle the key, secret and endpoints into an ROAuth credential object.
cred <- OAuthFactory$new(
  consumerKey = consr_key,
  consumerSecret = consr_secrt,
  requestURL = req_url,
  accessURL = acs_url,
  authURL = auth_url
)

handshake: twitterクライアント接続

# Run the OAuth handshake on the credential object, passing the downloaded
# cacert.pem as the CA bundle (`cainfo`).
# NOTE(review): presumably this prompts for a PIN obtained in the browser;
# confirm against the ROAuth documentation.
cred$handshake(cainfo="cacert.pem")

認証情報取得

# Register the credentials with twitteR for subsequent API calls.
# The access token and secret must be defined before this call; masked
# placeholders are used here, in the same way as the consumer key/secret.
# NOTE(review): presumably these values are issued on the Twitter application
# page; replace the placeholders with the real ones.
acs_token <- "***********"
acs_token_sec <- "***********"
setup_twitter_oauth(consr_key, consr_secrt, acs_token, acs_token_sec)

Lecture7: Network representation, TwitteR

文字単位のn-gram:英文

文字単位のn-gram:日本語

準備

文字単位のngram

単語単位のn-gram

準備：単語単位のリスト作成

単語単位のngram

RMeCabによる日本語形態素解析

記号を除去

単語単位のngram

ネットワーク描画

使用データ

ngramによる区分: ngram=3, step=2

ペアデータの作成

一行のデータ

netwkPairs.R（ペア単語取得用関数）のロード

ペアデータの取得

ペア頻度表

igraphをインストールする

igraphを利用した描画（有向グラフ）

igraphを利用した描画（無向グラフ）

igraphを利用した描画（エッジ幅とノードの大きさを調整）

ノードの色, フォントサイズ

tkplot

Twitterアプリケーションの作成＆登録

OAuth 認証用

ROAuth, twitteRのインストール

ROAuth, twitteRの読み込み

twitteRからのOAuth認証

cacert.pemをダウンロード

認証情報(Twitterアプリケーション)

handshake: twitterクライアント接続

認証情報取得

検索例

今日の課題（締め切り1月9日）：配布テキスト（日本語,英語）を使用して、単語のネットワーク描画アプリケーションをShinyで実装してください。

UI部分には、共起単語情報取得時のngram, stepをインタラクティブに変更できる機能をつけてください。

ネットワーク図の描画はplotを用いてください。