hoge <- "hogehoge"
n1 <- 5
n2 <- 3
n1 + n2
## [1] 8
n1 - n2
## [1] 2
n1 * n2
## [1] 15
n1 / n2
## [1] 1.666667
test <- c("a", "ab", "abc")
length(test)
## [1] 3
length(test[2])
## [1] 1
nchar(test[3])
## [1] 3
一行ずつ読み込んで、リストに格納
txt<-readLines("rits-sjis.txt")
txt[1]
## [1] "Since its establishment, based on the founding ideals of “freedom and innovation,” the education at Ritsumeikan University has allowed students to acquire various skills and values that are synonymous with a private-school environment. Over the years, such efforts have cultivated and sent a wealth of human resources out into the world. However, in the coming years, we must also focus on education that develops creative individuals that modern society needs. With this in mind, we must not only encourage a high-level of classroom learning, but also actively involve students in cutting-edge research activities conducted by our laboratories, encourage students to expose themselves to new challenges, and produce individuals capable of facing and resolving the tough challenges in the real world."
length(txt)
## [1] 7
wordL<-strsplit(txt,"[[:space:]]|[[:punct:]]")
wordL<-unlist(wordL)
wordL<-tolower(wordL)
wordL<-wordL[nchar(wordL)>0]
wordL<- wordL[wordL != ""]
tokens <- length(wordL)
tokens
## [1] 353
types <- length(unique(wordL))
types
## [1] 188
\[ TTR=\frac{types}{tokens} \times 100 \]
types/tokens*100
## [1] 53.25779
Tokens | Types | TTR (%) |
---|---|---|
353 | 188 | 53.26 |
freqL<-sort(table(wordL), decreasing=TRUE)
freqL[1:5]
## wordL
## the of and to in
## 28 16 12 11 9
subfreq <-freqL[1:10]
title="Word Frequency Distribution"
xlabel="Word"
ylabel="Frequency"
barplot(subfreq, main=title, xlab=xlabel, ylab=ylabel)
barplot(subfreq,col=rainbow(20), main=title, xlab=xlabel, ylab=ylabel)
colors()[1:10]
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
## [9] "aquamarine1" "aquamarine2"
bar<-barplot(subfreq,col="blue", main=title, xlab=xlabel, ylab=ylabel,las=3)
#text(bar, par("usr")[3] - 0.09, srt = 90, adj = 1, labels = as.character(rownames(freqLst[1:10])), xpd = TRUE, offset = 1, col = "blue")
全体を100としたときの出現率
relative <- freqL / sum(freqL)*100
sum(relative)
## [1] 100
round(relative,2)
## wordL
## the of and to in
## 7.93 4.53 3.40 3.12 2.55
## research that ritsumeikan we a
## 1.98 1.98 1.42 1.42 1.13
## are at individuals students with
## 1.13 1.13 1.13 1.13 1.13
## world also environment for is
## 1.13 0.85 0.85 0.85 0.85
## it must on society university
## 0.85 0.85 0.85 0.85 0.85
## activities as capable centers challenges
## 0.57 0.57 0.57 0.57 0.57
## changes co contribute current cutting
## 0.57 0.57 0.57 0.57 0.57
## development edge education encourage establishment
## 0.57 0.57 0.57 0.57 0.57
## global has human innovation its
## 0.57 0.57 0.57 0.57 0.57
## natural nature needs order our
## 0.57 0.57 0.57 0.57 0.57
## produce resources science sciences such
## 0.57 0.57 0.57 0.57 0.57
## technology they various will years
## 0.57 0.57 0.57 0.57 0.57
## 21st achievements acquire actively allowed
## 0.28 0.28 0.28 0.28 0.28
## although aware based believe burdens
## 0.28 0.28 0.28 0.28 0.28
## but by can century changing
## 0.28 0.28 0.28 0.28 0.28
## class classroom coming competition conducted
## 0.28 0.28 0.28 0.28 0.28
## creative critical cultivated developing develops
## 0.28 0.28 0.28 0.28 0.28
## efforts engaged exist existing expose
## 0.28 0.28 0.28 0.28 0.28
## facilitated facing focus founding freedom
## 0.28 0.28 0.28 0.28 0.28
## from fulfill further future gain
## 0.28 0.28 0.28 0.28 0.28
## generation giro globalization have high
## 0.28 0.28 0.28 0.28 0.28
## highest hope however humanities humankind
## 0.28 0.28 0.28 0.28 0.28
## humans i ideals importance important
## 0.28 0.28 0.28 0.28 0.28
## improve improving including institutes international
## 0.28 0.28 0.28 0.28 0.28
## into involve keep laboratories learning
## 0.28 0.28 0.28 0.28 0.28
## level levels many match mind
## 0.28 0.28 0.28 0.28 0.28
## modern my necessary new not
## 0.28 0.28 0.28 0.28 0.28
## obligated obtained occur only organization
## 0.28 0.28 0.28 0.28 0.28
## out over overcome overseas peacefully
## 0.28 0.28 0.28 0.28 0.28
## per potential powerful private produced
## 0.28 0.28 0.28 0.28 0.28
## provide r real realization recognize
## 0.28 0.28 0.28 0.28 0.28
## remaining requirements resolving responsible s
## 0.28 0.28 0.28 0.28 0.28
## same school sent since sincere
## 0.28 0.28 0.28 0.28 0.28
## skills so social standards successfully
## 0.28 0.28 0.28 0.28 0.28
## surmount surrounding synonymous take teach
## 0.28 0.28 0.28 0.28 0.28
## themselves these this those through
## 0.28 0.28 0.28 0.28 0.28
## tide time tough up values
## 0.28 0.28 0.28 0.28 0.28
## wealth where while
## 0.28 0.28 0.28
### Plot関数
plot(freqL)
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
plot(freqL, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
K=30
A=0.75
rank=seq(1,length(freqL))
rank
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## [154] 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [171] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## [188] 188
zipf = unlist(lapply(rank, function(r) K/r^A))
title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(freqL, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100))
plot(zipf, log="xy", type="l",col="red" , xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(freqL, log="xy", pch=8,col="darkgreen" , xlim=c(1,100),ylim=c(1,100))
legend(20,100,c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))
出現単語
words <- names(freqL)
words[1]
## [1] "the"
nchar(words[1])
## [1] 3
nchar関数:文字数(エクセルのlen関数と同じ)
wdLen <- unlist(lapply(words,nchar))
nchar関数:文字数(エクセルのlen関数と同じ)
wdLenFreq <- table(wdLen)
wdLenFreq
## wdLen
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 4 14 13 28 20 15 28 18 16 15 9 5 3
title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(wdLenFreq)
plot(wdLenFreq, type="b",pch=8,col="orange" , xlim=c(1,xmax),ylim=c(1,30),main=title, xlab=xlabel, ylab=ylabel)
mFreq <- table(freqL)
tokens
## [1] 353
names(mFreq[3])
## [1] "3"
mFreq[3]
## 3
## 9
as.numeric(names(mFreq[3]))*mFreq[3]
## 3
## 27
as.numeric(names(mFreq[3]))*mFreq[3]
## 3
## 27
m2 <- mapply( function(x,y) as.numeric(x)^2*y,names(mFreq),mFreq)
K <- 10000*(sum(m2)-tokens)/tokens^2
K
## [1] 131.7722