Lecture 5: Rでテキスト処理

基本操作

変数

文字を代入

hoge <- "hogehoge"

数字を代入

n1 <- 5
n2 <- 3

四則演算

n1 + n2
## [1] 8
n1 - n2
## [1] 2
n1 * n2
## [1] 15
n1/n2
## [1] 1.667

リスト(複数の要素)

リストの作成

test <- c("a", "ab", "abc")

要素数を調べる

length(test)
## [1] 3

要素を取り出す

length(test[2])
## [1] 1

単語の文字数を調べる

nchar(test[3])
## [1] 3

単語出現頻度表の作成

テキストファイルの読み込み

一行ずつ読み込んで、リストに格納

txt <- readLines("rits.txt")

一行目の内容

txt[1]
## [1] "Since its establishment, based on the founding ideals of “freedom and innovation,” the education at Ritsumeikan University has allowed students to acquire various skills and values that are synonymous with a private-school environment. Over the years, such efforts have cultivated and sent a wealth of human resources out into the world. However, in the coming years, we must also focus on education that develops creative individuals that modern society needs. With this in mind, we must not only encourage a high-level of classroom learning, but also actively involve students in cutting-edge research activities conducted by our laboratories, encourage students to expose themselves to new challenges, and produce individuals capable of facing and resolving the tough challenges in the real world."

読み込んだ行数

length(txt)
## [1] 8

スペース&記号による分割

wordL <- strsplit(txt, "[[:space:]]|[[:punct:]]")

各行のデータを一括化

wordL <- unlist(wordL)

小文字に変換

wordL <- tolower(wordL)

空白"“の削除

wordL <- wordL[nchar(wordL) > 0]
wordL <- wordL[wordL != ""]

単語のToken数

tokens <- length(wordL)
tokens
## [1] 353

単語のTypes数

types <- length(unique(wordL))
types
## [1] 188

TTR: Type-Token Ratioの計算

\[ TTR=\frac{types}{tokens} \times 100 \]

types/tokens * 100
## [1] 53.26

テキストの基本情報

Tokens Types TTR (%)
353 188 53.26

単語の頻度数

freqL <- sort(table(wordL), decreasing = TRUE)

単語の頻度数(上位5語)

freqL[1:5]
## wordL
## the  of and  to  in 
##  28  16  12  11   9

単語頻度数分布

subfreq <- freqL[1:10]
title = "Word Frequency Distribution"
xlabel = "Word"
ylabel = "Frequency"
barplot(subfreq, main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-20

単語頻度数分布(色付き)

barplot(subfreq, col = rainbow(20), main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-21

色の種類

colors()[1:10]
##  [1] "white"         "aliceblue"     "antiquewhite"  "antiquewhite1"
##  [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"   
##  [9] "aquamarine1"   "aquamarine2"

単語頻度数分布(単語ラベルの角度)

bar <- barplot(subfreq, col = "blue", main = title, xaxt = "n", xlab = "", ylab = ylabel)

plot of chunk unnamed-chunk-23

text(bar, par("usr")[3] - 0.09, srt = 90, adj = 1, labels = as.character(rownames(freqLst[1:10])), 
    xpd = TRUE, offset = 1, col = "blue")
## Error: オブジェクト 'freqLst' がありません

相対頻度数

全体を100としたときの出現率

relative <- freqL/sum(freqL) * 100

相対頻度の合計

sum(relative)
## [1] 100

小数点

round(relative, 2)
## wordL
##           the            of           and            to            in 
##          7.93          4.53          3.40          3.12          2.55 
##      research          that   ritsumeikan            we             a 
##          1.98          1.98          1.42          1.42          1.13 
##           are            at   individuals      students          with 
##          1.13          1.13          1.13          1.13          1.13 
##         world          also   environment           for            is 
##          1.13          0.85          0.85          0.85          0.85 
##            it          must            on       society    university 
##          0.85          0.85          0.85          0.85          0.85 
##    activities            as       capable       centers    challenges 
##          0.57          0.57          0.57          0.57          0.57 
##       changes            co    contribute       current       cutting 
##          0.57          0.57          0.57          0.57          0.57 
##   development          edge     education     encourage establishment 
##          0.57          0.57          0.57          0.57          0.57 
##        global           has         human    innovation           its 
##          0.57          0.57          0.57          0.57          0.57 
##       natural        nature         needs         order           our 
##          0.57          0.57          0.57          0.57          0.57 
##       produce     resources       science      sciences          such 
##          0.57          0.57          0.57          0.57          0.57 
##    technology          they       various          will         years 
##          0.57          0.57          0.57          0.57          0.57 
##          21st  achievements       acquire      actively       allowed 
##          0.28          0.28          0.28          0.28          0.28 
##      although         aware         based       believe       burdens 
##          0.28          0.28          0.28          0.28          0.28 
##           but            by           can       century      changing 
##          0.28          0.28          0.28          0.28          0.28 
##         class     classroom        coming   competition     conducted 
##          0.28          0.28          0.28          0.28          0.28 
##      creative      critical    cultivated    developing      develops 
##          0.28          0.28          0.28          0.28          0.28 
##       efforts       engaged         exist      existing        expose 
##          0.28          0.28          0.28          0.28          0.28 
##   facilitated        facing         focus      founding       freedom 
##          0.28          0.28          0.28          0.28          0.28 
##          from       fulfill       further        future          gain 
##          0.28          0.28          0.28          0.28          0.28 
##    generation          giro globalization          have          high 
##          0.28          0.28          0.28          0.28          0.28 
##       highest          hope       however    humanities     humankind 
##          0.28          0.28          0.28          0.28          0.28 
##        humans             i        ideals    importance     important 
##          0.28          0.28          0.28          0.28          0.28 
##       improve     improving     including    institutes international 
##          0.28          0.28          0.28          0.28          0.28 
##          into       involve          keep  laboratories      learning 
##          0.28          0.28          0.28          0.28          0.28 
##         level        levels          many         match          mind 
##          0.28          0.28          0.28          0.28          0.28 
##        modern            my     necessary           new           not 
##          0.28          0.28          0.28          0.28          0.28 
##     obligated      obtained         occur          only  organization 
##          0.28          0.28          0.28          0.28          0.28 
##           out          over      overcome      overseas    peacefully 
##          0.28          0.28          0.28          0.28          0.28 
##           per     potential      powerful       private      produced 
##          0.28          0.28          0.28          0.28          0.28 
##       provide             r          real   realization     recognize 
##          0.28          0.28          0.28          0.28          0.28 
##     remaining  requirements     resolving   responsible             s 
##          0.28          0.28          0.28          0.28          0.28 
##          same        school          sent         since       sincere 
##          0.28          0.28          0.28          0.28          0.28 
##        skills            so        social     standards  successfully 
##          0.28          0.28          0.28          0.28          0.28 
##      surmount   surrounding    synonymous          take         teach 
##          0.28          0.28          0.28          0.28          0.28 
##    themselves         these          this         those       through 
##          0.28          0.28          0.28          0.28          0.28 
##          tide          time         tough            up        values 
##          0.28          0.28          0.28          0.28          0.28 
##        wealth         where         while 
##          0.28          0.28          0.28

散布図(Plot関数)

### Plot関数

plot(freqL)

plot of chunk unnamed-chunk-27

頻度散布図

title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1, 
    100), main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-28

Zipf'sの法則

K = 30
A = 0.75
rank = seq(1, length(freqL))
rank
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## [154] 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [171] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## [188] 188
zipf = unlist(lapply(rank, function(r) K/r^A))
title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1, 
    100), main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-29

頻度散布図&Zipf'sの理論式の重ね書き

par(new = T)
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1, 
    100))

凡例

plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1, 
    100), main = title, xlab = xlabel, ylab = ylabel)
par(new = T)
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1, 
    100))
legend(20, 100, c("Frequency", "Zipf's\nlaw"), lty = c(NA, 1), pch = c(8, NA), 
    col = c("darkgreen", "red"))

plot of chunk unnamed-chunk-31


単語長頻度分布

出現単語

words <- names(freqL)
words[1]
## [1] "the"
nchar(words[1])
## [1] 3

単語の文字数計算

nchar関数:文字数(エクセルのlen関数と同じ)

wdLen <- unlist(lapply(words, nchar))

単語の文字数計算

nchar関数:文字数(エクセルのlen関数と同じ)

wdLenFreq <- table(wdLen)
wdLenFreq
## wdLen
##  1  2  3  4  5  6  7  8  9 10 11 12 13 
##  4 14 13 28 20 15 28 18 16 15  9  5  3

単語長分布(Types)

title = "Word Length Frequency Distribution (Types)"
xlabel = "Word Length"
ylabel = "Frequency"
xmax = length(wdLenFreq)
plot(wdLenFreq, type = "b", pch = 8, col = "orange", xlim = c(1, xmax), ylim = c(1, 
    30), main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-35


Yule'sのK特性値

頻度スペクトラム

mFreq <- table(freqL)

単語のToken数

tokens
## [1] 353

頻度パタン(m)とその頻度

names(mFreq[3])
## [1] "3"
mFreq[3]
## 3 
## 9
as.numeric(names(mFreq[3])) * mFreq[3]
##  3 
## 27

Yule'sのK特性値の部分計算

as.numeric(names(mFreq[3])) * mFreq[3]
##  3 
## 27
m2 <- mapply(function(x, y) as.numeric(x)^2 * y, names(mFreq), mFreq)

Yule'sのK特性値

K <- 10000 * (sum(m2) - tokens)/tokens^2
K
## [1] 131.8