Lecture 7

単語出現頻度表の作成

テキストファイルの読み込み

一行ずつ読み込んで、リストに格納

txt1<-readLines("rits-sjis.txt")

スペース&記号による分割

wordL1<-strsplit(txt1,"[[:space:]]|[[:punct:]]")

Punctuation characters:

! " # $ % & ’ ( ) * + , - . / : ; < = > ? @ [ ] ^ _ ` { | } ~.

単語の頻度数

wordL1<-unlist(wordL1)
wordL1<-tolower(wordL1)
wordL1<- wordL1[wordL1 != ""]
freqL1<-sort(table(wordL1), decreasing=TRUE)

単語の頻度数(上位5語)

freqL1[1:5]

## wordL1
## the  of and  to  in 
##  28  16  12  11   9

txt2<-readLines("osaka-u.txt")

txt2単語の頻度数(上位5語)

freqL2[1:5]

## wordL2
##        the        and         of university      osaka 
##         41         33         30         28         23

txt1とtxt2をデータフレーム型に変更

freqL1 <- data.frame(word = rownames(freqL1), freq = freqL1)
freqL2 <- data.frame(word = rownames(freqL2), freq = freqL2)

txt1とtxt2の頻度表結果を結合(merge)

freqMtx <- merge(freqL1, freqL2, all = T, by = "word")
head(freqMtx)

##           word freq.x freq.y
## 1         21st      1      1
## 2            a      4     15
## 3 achievements      1     NA
## 4      acquire      1     NA
## 5     actively      1     NA
## 6   activities      2     NA

欠損地

freqMtx[is.na(freqMtx)] <- 0
head(freqMtx)

##           word freq.x freq.y
## 1         21st      1      1
## 2            a      4     15
## 3 achievements      1      0
## 4      acquire      1      0
## 5     actively      1      0
## 6   activities      2      0

行名にfreqMtxの１列目を代入

row.names(freqMtx) <- freqMtx[, 1]
head(freqMtx)

##                      word freq.x freq.y
## 21st                 21st      1      1
## a                       a      4     15
## achievements achievements      1      0
## acquire           acquire      1      0
## actively         actively      1      0
## activities     activities      2      0

１列目のデータを削除

freqMtx <- freqMtx[-1]
head(freqMtx)

##              freq.x freq.y
## 21st              1      1
## a                 4     15
## achievements      1      0
## acquire           1      0
## actively          1      0
## activities        2      0

列名を定義

colnames(freqMtx) <- c("rits", "handai")
head(freqMtx)

##              rits handai
## 21st            1      1
## a               4     15
## achievements    1      0
## acquire         1      0
## actively        1      0
## activities      2      0

ritsのtoken数

sum(freqMtx$rits)

## [1] 353

sum(freqMtx[1])

## [1] 353

ritsのtype数

length(freqMtx$rits[freqMtx$rits>0])

## [1] 188

handaiのtoken, type数

## [1] 689

## [1] 305

freqMtx各列のtoken, type数

apply(freqMtx,2,sum)

##   rits handai 
##    353    689

apply(freqMtx,2,function(x) length(x[x>0]))

##   rits handai 
##    188    305

freqMtx各列のTTR

##     rits   handai 
## 53.25779 44.26705

散布図(Plot関数)

頻度散布図

title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
f1<-sort(freqMtx$rits,decreasing=TRUE)
plot(f1, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)

## Warning in xy.coords(x, y, xlabel, ylabel, log): 240 y values <= 0 omitted
## from logarithmic plot

Zipf’sの法則

K=30
A=0.75
rank=seq(1,length(freqL))
rank

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## [154] 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [171] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## [188] 188

zipf = unlist(lapply(rank, function(r) K/r^A))
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T) 
plot(freqL, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100))

頻度散布図＆Zipf’sの理論式の重ね書き

text(2,100,paste("K=",K))
text(2,80,paste("A=",A))

manipulate

library(manipulate)

title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"

manipulate({
  f1<-sort(freqMtx$rits,decreasing=TRUE)
  plot(f1, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
  zipf = unlist(lapply(rank, function(r) K/r^A))
  par(new=T) 
  plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
  text(2,100,paste("K=",K))
  text(2,80,paste("A=",A))
},
  K=slider(10,100,initial=30),
  A=slider(0.5,1.5,initial=1.0,step=0.1)
  )

単語長頻度分布1:Types

Rits出現単語

freqMtx[freqMtx$rits>0,]

##               rits handai
## 21st             1      1
## a                4     15
## achievements     1      0
## acquire          1      0
## actively         1      0
## activities       2      0
## allowed          1      0
## also             3      0
## although         1      0
## and             12     33
## are              4      2
## as               2     13
## at               4      1
## aware            1      0
## based            1      0
## believe          1      1
## burdens          1      0
## but              1      0
## by               1      5
## can              1      5
## capable          2      1
## centers          2      0
## century          1      3
## challenges       2      1
## changes          2      0
## changing         1      0
## class            1      0
## classroom        1      0
## co               2      0
## coming           1      0
## competition      1      0
## conducted        1      0
## contribute       2      0
## creative         1      1
## critical         1      0
## cultivated       1      0
## current          2      0
## cutting          2      0
## developing       1      0
## development      2      2
## develops         1      0
## edge             2      0
## education        2      5
## efforts          1      0
## encourage        2      0
## engaged          1      0
## environment      3      1
## establishment    2      0
## exist            1      0
## existing         1      0
## expose           1      0
## facilitated      1      0
## facing           1      1
## focus            1      1
## for              3     10
## founding         1      0
## freedom          1      0
## from             1      0
## fulfill          1      0
## further          1      0
## future           1      3
## gain             1      0
## generation       1      1
## giro             1      0
## global           2      0
## globalization    1      0
## has              2      3
## have             1      1
## high             1      0
## highest          1      0
## hope             1      0
## however          1      1
## human            2      1
## humanities       1      1
## humankind        1      0
## humans           1      0
## i                1      4
## ideals           1      0
## importance       1      0
## important        1      0
## improve          1      0
## improving        1      0
## in               9     12
## including        1      0
## individuals      4      0
## innovation       2      0
## institutes       1      0
## international    1      0
## into             1      2
## involve          1      0
## is               3      2
## it               3      2
## its              2      4
## keep             1      0
## laboratories     1      0
## learning         1      4
## level            1      0
## levels           1      0
## many             1      1
## match            1      0
## mind             1      0
## modern           1      3
## must             3      0
## my               1      0
## natural          2      0
## nature           2      0
## necessary        1      0
## needs            2      0
## new              1      0
## not              1      0
## obligated        1      0
## obtained         1      0
## occur            1      0
## of              16     30
## on               3      8
## only             1      1
## order            2      0
## organization     1      1
## our              2      5
## out              1      0
## over             1      0
## overcome         1      0
## overseas         1      0
## peacefully       1      0
## per              1      0
## potential        1      1
## powerful         1      0
## private          1      1
## produce          2      0
## produced         1      1
## provide          1      2
## r                1      0
## real             1      0
## realization      1      0
## recognize        1      0
## remaining        1      0
## requirements     1      0
## research         7      4
## resolving        1      0
## resources        2      1
## responsible      1      0
## ritsumeikan      5      0
## s                1      6
## same             1      0
## school           1      1
## science          2      1
## sciences         2      2
## sent             1      0
## since            1      0
## sincere          1      0
## skills           1      0
## so               1      0
## social           1      0
## society          3      4
## standards        1      0
## students         4      0
## successfully     1      0
## such             2      1
## surmount         1      0
## surrounding      1      0
## synonymous       1      0
## take             1      0
## teach            1      0
## technology       2      0
## that             7      9
## the             28     41
## themselves       1      0
## these            1      2
## they             2      0
## this             1      2
## those            1      0
## through          1      1
## tide             1      0
## time             1      1
## to              11     23
## tough            1      0
## university       3     28
## up               1      2
## values           1      0
## various          2      0
## we               5      4
## wealth           1      0
## where            1      0
## while            1      0
## will             2      3
## with             4      3
## world            4      2
## years            2      0

words<- rownames(freqMtx[freqMtx$rits>0,])
head(words)

## [1] "21st"         "a"            "achievements" "acquire"     
## [5] "actively"     "activities"

nchar(words[1])

## [1] 4

単語の文字数計算

nchar関数：文字数（エクセルのlen関数と同じ）

wdLen <- unlist(lapply(words,nchar))

単語文字数の集計

wdLenFreq <- table(wdLen)
wdLenFreq

## wdLen
##  1  2  3  4  5  6  7  8  9 10 11 12 13 
##  4 14 13 28 20 15 28 18 16 15  9  5  3

単語長分布（Types）

title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(wdLenFreq)
plot(wdLenFreq1, type="b",pch=8,col="orange" , xlim=c(1,xmax),ylim=c(1,30),main=title, xlab=xlabel, ylab=ylabel)

Handai:出現単語

words2<- rownames(freqMtx[freqMtx$handai>0,])

単語長集計

## wdLen
##  1  2  3  4  5  6  7  8  9 10 11 12 13 
##  4 18 25 49 35 31 42 36 22 16 17  4  6

単語長分布（Types）

title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(wdLenFreq)
ymax=max(wdLenFreq)
plot(wdLenFreq, type="b",pch=8,col="green" , xlim=c(1,xmax),ylim=c(1,ymax),main=title, xlab=xlabel, ylab=ylabel)

Yule’sのK特性値

頻度スペクトラム

freqL<-freqMtx$rits[freqMtx$rits>0]
mFreq <- table(freqL)

単語のToken数

tokens<-sum(freqL)

頻度パタン(m)とその頻度

names(mFreq[3])

## [1] "3"

mFreq[3]

## 3 
## 9

as.numeric(names(mFreq[3]))*mFreq[3]

##  3 
## 27

Yule’sのK特性値の部分計算

as.numeric(names(mFreq[3]))*mFreq[3]

##  3 
## 27

m2 <- mapply( function(x,y) as.numeric(x)^2*y,names(mFreq),mFreq)

Yule’sのK特性値

K <- 10000*(sum(m2)-tokens)/tokens^2
K

## [1] 131.7722