Lecture7: Network representation, TwitteR

文字単位のn-gram:英文

# Sample English sentence used for the character-level n-gram demo.
test1 <- "Osaka University said Wednesday that personal data of around 80,000 students"
substr(test1, 1, 4)
## [1] "Osak"

# Character n-grams: slide a window of `size` characters over test1.
size <- 4
# Number of complete windows, i.e. the last valid start position.
len <- nchar(test1) - size + 1
ngramLst <- c()
# Stop at `len` (not nchar(test1)) so that no truncated n-grams shorter than
# `size` are appended at the tail. seq_len() also gives an empty loop when the
# text is shorter than `size`.
for (i in seq_len(max(len, 0))) {
  ngramLst <- rbind(ngramLst, substr(test1, i, i + size - 1))
}

head(ngramLst)
##      [,1]  
## [1,] "Osak"
## [2,] "saka"
## [3,] "aka "
## [4,] "ka U"
## [5,] "a Un"
## [6,] " Uni"

文字単位のn-gram:日本語

準備

# Sample Japanese sentence used for the character-level n-gram demo.
test2 <- "大阪大学は13日、学内システムに海外から不正アクセスがあり、"
# First four characters of the sentence.
substr(test2, start = 1, stop = 4)
## [1] "大阪大学"
# Number of characters in the sentence.
nchar(test2)
## [1] 30

文字単位のngram

# Character 4-grams of test2: one row per window start position.
size <- 4
len <- nchar(test2) - size + 1
ngramLst <- do.call(
  rbind,
  lapply(1:len, function(pos) substr(test2, pos, pos + size - 1))
)

head(ngramLst)
##      [,1]      
## [1,] "大阪大学"
## [2,] "阪大学は"
## [3,] "大学は1"
## [4,] "学は13"
## [5,] "は13日"
## [6,] "13日、"

単語単位のn-gram

準備:単語単位のリスト作成

# Tokenise test1 into lowercase words: split on whitespace or punctuation,
# then discard any empty strings the split may leave behind.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
wordLst <- wordLst[nzchar(wordLst)]
wordLst
##  [1] "osaka"      "university" "said"       "wednesday"  "that"      
##  [6] "personal"   "data"       "of"         "around"     "80"        
## [11] "000"        "students"

単語単位のngram

# Word-level windows of `size` tokens over wordLst.
size <- 3
start <- 1
# A single window of `size` words beginning at `start`.
wordLst[start:(start + size - 1)]
## [1] "osaka"      "university" "said"
# Window start positions advance by `size`, so consecutive rows do not overlap.
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, size), function(first) wordLst[first:(first + size - 1)])
)
strLst
##      [,1]        [,2]         [,3]      
## [1,] "osaka"     "university" "said"    
## [2,] "wednesday" "that"       "personal"
## [3,] "data"      "of"         "around"  
## [4,] "80"        "000"        "students"

RMecabによる日本語形態素解析

# Morphological analysis of the Japanese sentence with RMeCab.
library("RMeCab")
# RMeCabC() output is flattened with unlist(); as the printed result shows,
# the outcome is a character vector of morphemes whose names are the
# part-of-speech tags.
wordLst<-unlist(RMeCabC(test2))
wordLst
##       名詞       助詞       名詞       名詞       名詞       記号 
## "大阪大学"       "は"       "1"       "3"       "日"       "、" 
##       名詞       名詞       助詞       名詞       助詞       名詞 
##     "学内" "システム"       "に"     "海外"     "から"     "不正" 
##       名詞       助詞       動詞       記号 
## "アクセス"       "が"     "あり"       "、"

記号を削除

# Keep every morpheme except those tagged as a symbol ("記号").
wordLst <- wordLst[!(names(wordLst) == "記号")]
wordLst
##       名詞       助詞       名詞       名詞       名詞       名詞 
## "大阪大学"       "は"       "1"       "3"       "日"     "学内" 
##       名詞       助詞       名詞       助詞       名詞       名詞 
## "システム"       "に"     "海外"     "から"     "不正" "アクセス" 
##       助詞       動詞 
##       "が"     "あり"

単語単位のngram

# Word-level windows of `size` morphemes over the filtered Japanese tokens.
# Start positions advance by `size`, so rows do not overlap; tokens left over
# after the last full window are not included.
size <- 3
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, size), function(first) wordLst[first:(first + size - 1)])
)
strLst
##      名詞       助詞   名詞      
## [1,] "大阪大学" "は"   "1"      
## [2,] "3"       "日"   "学内"    
## [3,] "システム" "に"   "海外"    
## [4,] "から"     "不正" "アクセス"

ネットワーク描画

使用データ

# Rebuild the English token list from test1 (lowercased, split on whitespace
# or punctuation, empty strings removed) as input for the network example.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
wordLst <- wordLst[nzchar(wordLst)]
wordLst
##  [1] "osaka"      "university" "said"       "wednesday"  "that"      
##  [6] "personal"   "data"       "of"         "around"     "80"        
## [11] "000"        "students"

ngramによる区分: ngram=3, step=2

# Overlapping word windows: each row holds `size` consecutive words and the
# window start moves forward by `step` words per row.
size <- 3
step <- 2
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, step), function(first) wordLst[first:(first + size - 1)])
)
strLst
##      [,1]     [,2]         [,3]    
## [1,] "osaka"  "university" "said"  
## [2,] "said"   "wednesday"  "that"  
## [3,] "that"   "personal"   "data"  
## [4,] "data"   "of"         "around"
## [5,] "around" "80"         "000"

ペアデータの作成

一行のデータ

# Words of the first window.
str <- strLst[1, ]

# Enumerate every pair (left, right) with left before right and stack the
# pairs as rows of a two-column matrix.
n_terms <- length(str)
pLst <- c()
for (left in 1:(n_terms - 1)) {
  for (right in (left + 1):n_terms) {
    pLst <- rbind(pLst, cbind(str[left], str[right]))
  }
}
pLst
##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "said"      
## [3,] "university" "said"

netwkPairs.R(ペア単語取得用関数)のロード

# Load the pair helper functions from netwkPairs.R in the working directory
# (presumably getPairs(), getPairsLst() and getPairsFreq() used below — verify
# against that file).
source("netwkPairs.R")

ペアデータの取得

# Word pairs within the first window (row 1 of strLst).
getPairs(strLst[1,])
##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "said"      
## [3,] "university" "said"
# Word pairs within the second window (row 2 of strLst).
getPairs(strLst[2,])
##      [,1]        [,2]       
## [1,] "said"      "wednesday"
## [2,] "said"      "that"     
## [3,] "wednesday" "that"
# Pairs for every row of strLst, stacked into one two-column matrix.
getPairsLst(strLst)
##       [,1]         [,2]        
##  [1,] "osaka"      "university"
##  [2,] "osaka"      "said"      
##  [3,] "university" "said"      
##  [4,] "said"       "wednesday" 
##  [5,] "said"       "that"      
##  [6,] "wednesday"  "that"      
##  [7,] "that"       "personal"  
##  [8,] "that"       "data"      
##  [9,] "personal"   "data"      
## [10,] "data"       "of"        
## [11,] "data"       "around"    
## [12,] "of"         "around"    
## [13,] "around"     "80"        
## [14,] "around"     "000"       
## [15,] "80"         "000"

ペア頻度表

# Pair frequency table; the printed head shows columns Term1, Term2 and Freq.
pFreq<-getPairsFreq(strLst)
head(pFreq)
##        Term1      Term2 Freq
## 1      osaka university    1
## 2      osaka       said    1
## 3 university       said    1
## 4       said  wednesday    1
## 5       said       that    1
## 6  wednesday       that    1

igraphをインストールする

# One-time installation of igraph; no need to rerun once it is installed.
install.packages("igraph")

igraphを利用した描画(有向グラフ)

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Build a graph from the pair table (graph.data.frame() is directed by
# default) and draw it.
wng<-graph.data.frame(pFreq)
plot(wng)

igraphを利用した描画(無向グラフ)

# Convert the graph built from the pair table to an undirected one and draw it.
wng<-as.undirected(graph.data.frame(pFreq))
plot(wng)

igraphを利用した描画(エッジ幅とノードの大きさを調整)

# Undirected word network with edge width taken from the pair frequency and
# vertex size scaled by degree.
# as.undirected() in "collapse" mode may merge edges and is not guaranteed to
# keep pFreq's row order, so Freq is carried through the conversion as an edge
# attribute (summed for merged pairs) instead of assigning pFreq$Freq by
# position.
wng <- as.undirected(
  graph.data.frame(pFreq),
  mode = "collapse",
  edge.attr.comb = list(Freq = "sum", "ignore")
)
E(wng)$weight <- E(wng)$Freq
deg <- degree(wng)
# The highest-degree vertex is drawn at size 30; the others proportionally.
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

ノードの色, フォントサイズ

# Weighted undirected word network with custom vertex colour, shape and label
# styling.
# as.undirected() in "collapse" mode may merge edges and is not guaranteed to
# keep pFreq's row order, so Freq is carried through the conversion as an edge
# attribute (summed for merged pairs) instead of assigning pFreq$Freq by
# position.
wng <- as.undirected(
  graph.data.frame(pFreq),
  mode = "collapse",
  edge.attr.comb = list(Freq = "sum", "ignore")
)
E(wng)$weight <- E(wng)$Freq
deg <- degree(wng)
# Vertex appearance: fill colour, shape, label size multiplier, label colour.
V(wng)$color <- "lightblue"
V(wng)$shape <- "rectangle"
V(wng)$label.cex <- 1.5
V(wng)$label.color <- "red"
# The highest-degree vertex is drawn at size 30; the others proportionally.
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

tkplot

# Draw the same graph with tkplot(), passing the same edge widths and vertex
# sizes as the plot() call above.
tkplot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))

Twitterアプリケーションの作成&登録

OAuth 認証用

https://apps.twitter.com/

ROAuth, twitteRのインストール

  # One-time installation of the Twitter client and OAuth packages.
  install.packages(c("twitteR", "ROAuth"))

ROAuth, twitteRの読み込み

  # Attach the Twitter client and the OAuth helper package.
  library(twitteR)
  library(ROAuth)

twitteRからのOauth認証

cacert.pemをダウンロード

  # Fetch the CA bundle over HTTPS: a certificate store downloaded over plain
  # HTTP can be tampered with in transit, which defeats its purpose.
  download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")

認証情報(Twitterアプリケーション)

# Consumer key and secret of the registered Twitter application (placeholders;
# replace with your own values).
consr_key <- "***********"
consr_secrt <- "***********"
# Twitter OAuth endpoints: request token, access token, user authorisation.
req_url <- "https://api.twitter.com/oauth/request_token"
acs_url <- "https://api.twitter.com/oauth/access_token"
auth_url <- "https://api.twitter.com/oauth/authorize"
# ROAuth credential object built from the keys and endpoints above.
cred <- OAuthFactory$new(
  consumerKey = consr_key,
  consumerSecret = consr_secrt,
  requestURL = req_url,
  accessURL = acs_url,
  authURL = auth_url
)

handshake: twitterクライアント接続

# Run the OAuth handshake, passing the downloaded CA bundle file as `cainfo`.
cred$handshake(cainfo="cacert.pem")

認証情報取得

# Access token and secret for the Twitter application. These were never
# defined in this script, so the call below failed with "object not found".
# Replace the placeholders with your own values (same placeholder style as
# consr_key / consr_secrt).
acs_token <- "***********"
acs_token_sec <- "***********"
setup_twitter_oauth(consr_key, consr_secrt, acs_token, acs_token_sec)

検索例

# Hashtag searches, requesting 10 tweets each (n=10).
searchTwitter("#DH2018", n=10)
searchTwitter("#永世七冠", n=10)

# Timeline of a given account.
username = "casualconc"
userTimeline(username)

# Same call for another account.
username = "langstat"
userTimeline(username)

今日の課題(締め切り1月9日):配布テキスト(日本語,英語)を使用して、単語のネットワーク描画アプリケーションをShinyで実装してください。

UI部分には、共起単語情報取得時のngram, stepをインタラクティブに変更できる機能をつけてください。

ネットワーク図の描画はplotを用いてください。