一行ずつ読み込んで、リストに格納
txt1<-readLines("rits-sjis.txt")
wordL1<-strsplit(txt1,"[[:space:]]|[[:punct:]]")
! " # $ % & ’ ( ) * + , - . / : ; < = > ? @ [ ] ^ _ ` { | } ~.
wordL1<-unlist(wordL1)
wordL1<-tolower(wordL1)
wordL1<- wordL1[wordL1 != ""]
freqL1<-sort(table(wordL1), decreasing=TRUE)
freqL1[1:5]
## wordL1
## the of and to in
## 28 16 12 11 9
txt2<-readLines("osaka-u.txt")
freqL2[1:5]
## wordL2
## the and of university osaka
## 41 33 30 28 23
freqL1 <- data.frame(word = rownames(freqL1), freq = freqL1)
freqL2 <- data.frame(word = rownames(freqL2), freq = freqL2)
freqMtx <- merge(freqL1, freqL2, all = T, by = "word")
head(freqMtx)
## word freq.x freq.y
## 1 21st 1 1
## 2 a 4 15
## 3 achievements 1 NA
## 4 acquire 1 NA
## 5 actively 1 NA
## 6 activities 2 NA
freqMtx[is.na(freqMtx)] <- 0
head(freqMtx)
## word freq.x freq.y
## 1 21st 1 1
## 2 a 4 15
## 3 achievements 1 0
## 4 acquire 1 0
## 5 actively 1 0
## 6 activities 2 0
row.names(freqMtx) <- freqMtx[, 1]
head(freqMtx)
## word freq.x freq.y
## 21st 21st 1 1
## a a 4 15
## achievements achievements 1 0
## acquire acquire 1 0
## actively actively 1 0
## activities activities 2 0
freqMtx <- freqMtx[-1]
head(freqMtx)
## freq.x freq.y
## 21st 1 1
## a 4 15
## achievements 1 0
## acquire 1 0
## actively 1 0
## activities 2 0
colnames(freqMtx) <- c("rits", "handai")
head(freqMtx)
## rits handai
## 21st 1 1
## a 4 15
## achievements 1 0
## acquire 1 0
## actively 1 0
## activities 2 0
sum(freqMtx$rits)
## [1] 353
sum(freqMtx[1])
## [1] 353
length(freqMtx$rits[freqMtx$rits>0])
## [1] 188
## [1] 689
## [1] 305
apply(freqMtx,2,sum)
## rits handai
## 353 689
apply(freqMtx,2,function(x) length(x[x>0]))
## rits handai
## 188 305
## rits handai
## 53.25779 44.26705
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
f1<-sort(freqMtx$rits,decreasing=TRUE)
plot(f1, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 240 y values <= 0 omitted
## from logarithmic plot
K=30
A=0.75
rank=seq(1,length(freqL))
rank
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## [154] 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [171] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## [188] 188
zipf = unlist(lapply(rank, function(r) K/r^A))
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(freqL, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100))
text(2,100,paste("K=",K))
text(2,80,paste("A=",A))
library(manipulate)
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
manipulate({
f1<-sort(freqMtx$rits,decreasing=TRUE)
plot(f1, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
zipf = unlist(lapply(rank, function(r) K/r^A))
par(new=T)
plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
text(2,100,paste("K=",K))
text(2,80,paste("A=",A))
},
K=slider(10,100,initial=30),
A=slider(0.5,1.5,initial=1.0,step=0.1)
)
freqMtx[freqMtx$rits>0,]
## rits handai
## 21st 1 1
## a 4 15
## achievements 1 0
## acquire 1 0
## actively 1 0
## activities 2 0
## allowed 1 0
## also 3 0
## although 1 0
## and 12 33
## are 4 2
## as 2 13
## at 4 1
## aware 1 0
## based 1 0
## believe 1 1
## burdens 1 0
## but 1 0
## by 1 5
## can 1 5
## capable 2 1
## centers 2 0
## century 1 3
## challenges 2 1
## changes 2 0
## changing 1 0
## class 1 0
## classroom 1 0
## co 2 0
## coming 1 0
## competition 1 0
## conducted 1 0
## contribute 2 0
## creative 1 1
## critical 1 0
## cultivated 1 0
## current 2 0
## cutting 2 0
## developing 1 0
## development 2 2
## develops 1 0
## edge 2 0
## education 2 5
## efforts 1 0
## encourage 2 0
## engaged 1 0
## environment 3 1
## establishment 2 0
## exist 1 0
## existing 1 0
## expose 1 0
## facilitated 1 0
## facing 1 1
## focus 1 1
## for 3 10
## founding 1 0
## freedom 1 0
## from 1 0
## fulfill 1 0
## further 1 0
## future 1 3
## gain 1 0
## generation 1 1
## giro 1 0
## global 2 0
## globalization 1 0
## has 2 3
## have 1 1
## high 1 0
## highest 1 0
## hope 1 0
## however 1 1
## human 2 1
## humanities 1 1
## humankind 1 0
## humans 1 0
## i 1 4
## ideals 1 0
## importance 1 0
## important 1 0
## improve 1 0
## improving 1 0
## in 9 12
## including 1 0
## individuals 4 0
## innovation 2 0
## institutes 1 0
## international 1 0
## into 1 2
## involve 1 0
## is 3 2
## it 3 2
## its 2 4
## keep 1 0
## laboratories 1 0
## learning 1 4
## level 1 0
## levels 1 0
## many 1 1
## match 1 0
## mind 1 0
## modern 1 3
## must 3 0
## my 1 0
## natural 2 0
## nature 2 0
## necessary 1 0
## needs 2 0
## new 1 0
## not 1 0
## obligated 1 0
## obtained 1 0
## occur 1 0
## of 16 30
## on 3 8
## only 1 1
## order 2 0
## organization 1 1
## our 2 5
## out 1 0
## over 1 0
## overcome 1 0
## overseas 1 0
## peacefully 1 0
## per 1 0
## potential 1 1
## powerful 1 0
## private 1 1
## produce 2 0
## produced 1 1
## provide 1 2
## r 1 0
## real 1 0
## realization 1 0
## recognize 1 0
## remaining 1 0
## requirements 1 0
## research 7 4
## resolving 1 0
## resources 2 1
## responsible 1 0
## ritsumeikan 5 0
## s 1 6
## same 1 0
## school 1 1
## science 2 1
## sciences 2 2
## sent 1 0
## since 1 0
## sincere 1 0
## skills 1 0
## so 1 0
## social 1 0
## society 3 4
## standards 1 0
## students 4 0
## successfully 1 0
## such 2 1
## surmount 1 0
## surrounding 1 0
## synonymous 1 0
## take 1 0
## teach 1 0
## technology 2 0
## that 7 9
## the 28 41
## themselves 1 0
## these 1 2
## they 2 0
## this 1 2
## those 1 0
## through 1 1
## tide 1 0
## time 1 1
## to 11 23
## tough 1 0
## university 3 28
## up 1 2
## values 1 0
## various 2 0
## we 5 4
## wealth 1 0
## where 1 0
## while 1 0
## will 2 3
## with 4 3
## world 4 2
## years 2 0
words<- rownames(freqMtx[freqMtx$rits>0,])
head(words)
## [1] "21st" "a" "achievements" "acquire"
## [5] "actively" "activities"
nchar(words[1])
## [1] 4
nchar関数:文字数(エクセルのlen関数と同じ)
wdLen <- unlist(lapply(words,nchar))
wdLenFreq <- table(wdLen)
wdLenFreq
## wdLen
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 4 14 13 28 20 15 28 18 16 15 9 5 3
title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(wdLenFreq)
plot(wdLenFreq1, type="b",pch=8,col="orange" , xlim=c(1,xmax),ylim=c(1,30),main=title, xlab=xlabel, ylab=ylabel)
words2<- rownames(freqMtx[freqMtx$handai>0,])
## wdLen
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 4 18 25 49 35 31 42 36 22 16 17 4 6
title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(wdLenFreq)
ymax=max(wdLenFreq)
plot(wdLenFreq, type="b",pch=8,col="green" , xlim=c(1,xmax),ylim=c(1,ymax),main=title, xlab=xlabel, ylab=ylabel)
freqL<-freqMtx$rits[freqMtx$rits>0]
mFreq <- table(freqL)
tokens<-sum(freqL)
names(mFreq[3])
## [1] "3"
mFreq[3]
## 3
## 9
as.numeric(names(mFreq[3]))*mFreq[3]
## 3
## 27
as.numeric(names(mFreq[3]))*mFreq[3]
## 3
## 27
m2 <- mapply( function(x,y) as.numeric(x)^2*y,names(mFreq),mFreq)
K <- 10000*(sum(m2)-tokens)/tokens^2
K
## [1] 131.7722