# Lecture 5: Rでテキスト処理

## 基本操作

### 変数

#### 文字を代入

hoge <- "hogehoge"


#### 数字を代入

n1 <- 5
n2 <- 3


#### 四則演算

n1 + n2

## [1] 8

n1 - n2

## [1] 2

n1 * n2

## [1] 15

n1/n2

## [1] 1.667


### リスト（複数の要素）

#### リストの作成

test <- c("a", "ab", "abc")


#### 要素数を調べる

length(test)

## [1] 3


#### 要素を取り出す

length(test[2])

## [1] 1


#### 単語の文字数を調べる

nchar(test[3])

## [1] 3


## 単語出現頻度表の作成

#### テキストファイルの読み込み

txt <- readLines("rits.txt")


#### 一行目の内容

txt[1]

## [1] "Since its establishment, based on the founding ideals of “freedom and innovation,” the education at Ritsumeikan University has allowed students to acquire various skills and values that are synonymous with a private-school environment. Over the years, such efforts have cultivated and sent a wealth of human resources out into the world. However, in the coming years, we must also focus on education that develops creative individuals that modern society needs. With this in mind, we must not only encourage a high-level of classroom learning, but also actively involve students in cutting-edge research activities conducted by our laboratories, encourage students to expose themselves to new challenges, and produce individuals capable of facing and resolving the tough challenges in the real world."


#### 読み込んだ行数

length(txt)

## [1] 8


#### スペース&記号による分割

wordL <- strsplit(txt, "[[:space:]]|[[:punct:]]")


#### 各行のデータを一括化

wordL <- unlist(wordL)


#### 小文字に変換

wordL <- tolower(wordL)


#### 空白"“の削除

wordL <- wordL[nchar(wordL) > 0]
wordL <- wordL[wordL != ""]


#### 単語のToken数

tokens <- length(wordL)
tokens

## [1] 353


#### 単語のTypes数

• unique()関数は，リストの重複しない要素を返す
types <- length(unique(wordL))
types

## [1] 188


#### TTR: Type-Token Ratioの計算

$TTR=\frac{types}{tokens} \times 100$

types/tokens * 100

## [1] 53.26


#### テキストの基本情報

Tokens Types TTR (%)
353 188 53.26

#### 単語の頻度数

freqL <- sort(table(wordL), decreasing = TRUE)


#### 単語の頻度数(上位5語)

freqL[1:5]

## wordL
## the  of and  to  in
##  28  16  12  11   9


#### 単語頻度数分布

subfreq <- freqL[1:10]
title = "Word Frequency Distribution"
xlabel = "Word"
ylabel = "Frequency"
barplot(subfreq, main = title, xlab = xlabel, ylab = ylabel)


#### 単語頻度数分布(色付き)

barplot(subfreq, col = rainbow(20), main = title, xlab = xlabel, ylab = ylabel)


#### 色の種類

colors()[1:10]

##  [1] "white"         "aliceblue"     "antiquewhite"  "antiquewhite1"
##  [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
##  [9] "aquamarine1"   "aquamarine2"


#### 単語頻度数分布(単語ラベルの角度)

bar <- barplot(subfreq, col = "blue", main = title, xaxt = "n", xlab = "", ylab = ylabel)


text(bar, par("usr")[3] - 0.09, srt = 90, adj = 1, labels = as.character(rownames(freqLst[1:10])),
xpd = TRUE, offset = 1, col = "blue")

## Error: オブジェクト 'freqLst' がありません


#### 相対頻度数

relative <- freqL/sum(freqL) * 100


#### 相対頻度の合計

sum(relative)

## [1] 100


#### 小数点

round(relative, 2)

## wordL
##           the            of           and            to            in
##          7.93          4.53          3.40          3.12          2.55
##      research          that   ritsumeikan            we             a
##          1.98          1.98          1.42          1.42          1.13
##           are            at   individuals      students          with
##          1.13          1.13          1.13          1.13          1.13
##         world          also   environment           for            is
##          1.13          0.85          0.85          0.85          0.85
##            it          must            on       society    university
##          0.85          0.85          0.85          0.85          0.85
##    activities            as       capable       centers    challenges
##          0.57          0.57          0.57          0.57          0.57
##       changes            co    contribute       current       cutting
##          0.57          0.57          0.57          0.57          0.57
##   development          edge     education     encourage establishment
##          0.57          0.57          0.57          0.57          0.57
##        global           has         human    innovation           its
##          0.57          0.57          0.57          0.57          0.57
##       natural        nature         needs         order           our
##          0.57          0.57          0.57          0.57          0.57
##       produce     resources       science      sciences          such
##          0.57          0.57          0.57          0.57          0.57
##    technology          they       various          will         years
##          0.57          0.57          0.57          0.57          0.57
##          21st  achievements       acquire      actively       allowed
##          0.28          0.28          0.28          0.28          0.28
##      although         aware         based       believe       burdens
##          0.28          0.28          0.28          0.28          0.28
##           but            by           can       century      changing
##          0.28          0.28          0.28          0.28          0.28
##         class     classroom        coming   competition     conducted
##          0.28          0.28          0.28          0.28          0.28
##      creative      critical    cultivated    developing      develops
##          0.28          0.28          0.28          0.28          0.28
##       efforts       engaged         exist      existing        expose
##          0.28          0.28          0.28          0.28          0.28
##   facilitated        facing         focus      founding       freedom
##          0.28          0.28          0.28          0.28          0.28
##          from       fulfill       further        future          gain
##          0.28          0.28          0.28          0.28          0.28
##    generation          giro globalization          have          high
##          0.28          0.28          0.28          0.28          0.28
##       highest          hope       however    humanities     humankind
##          0.28          0.28          0.28          0.28          0.28
##        humans             i        ideals    importance     important
##          0.28          0.28          0.28          0.28          0.28
##       improve     improving     including    institutes international
##          0.28          0.28          0.28          0.28          0.28
##          into       involve          keep  laboratories      learning
##          0.28          0.28          0.28          0.28          0.28
##         level        levels          many         match          mind
##          0.28          0.28          0.28          0.28          0.28
##        modern            my     necessary           new           not
##          0.28          0.28          0.28          0.28          0.28
##     obligated      obtained         occur          only  organization
##          0.28          0.28          0.28          0.28          0.28
##           out          over      overcome      overseas    peacefully
##          0.28          0.28          0.28          0.28          0.28
##           per     potential      powerful       private      produced
##          0.28          0.28          0.28          0.28          0.28
##       provide             r          real   realization     recognize
##          0.28          0.28          0.28          0.28          0.28
##     remaining  requirements     resolving   responsible             s
##          0.28          0.28          0.28          0.28          0.28
##          same        school          sent         since       sincere
##          0.28          0.28          0.28          0.28          0.28
##        skills            so        social     standards  successfully
##          0.28          0.28          0.28          0.28          0.28
##      surmount   surrounding    synonymous          take         teach
##          0.28          0.28          0.28          0.28          0.28
##    themselves         these          this         those       through
##          0.28          0.28          0.28          0.28          0.28
##          tide          time         tough            up        values
##          0.28          0.28          0.28          0.28          0.28
##        wealth         where         while
##          0.28          0.28          0.28


## 散布図(Plot関数)

###　Plot関数

plot(freqL)


#### 頻度散布図

title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1,
100), main = title, xlab = xlabel, ylab = ylabel)


#### Zipf'sの法則

K = 30
A = 0.75
rank = seq(1, length(freqL))
rank

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## [154] 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [171] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## [188] 188

zipf = unlist(lapply(rank, function(r) K/r^A))
title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1,
100), main = title, xlab = xlabel, ylab = ylabel)


#### 頻度散布図＆Zipf'sの理論式の重ね書き

par(new = T)
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1,
100))


#### 凡例

plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1,
100), main = title, xlab = xlabel, ylab = ylabel)
par(new = T)
plot(freqL, log = "xy", pch = 8, col = "darkgreen", xlim = c(1, 100), ylim = c(1,
100))
legend(20, 100, c("Frequency", "Zipf's\nlaw"), lty = c(NA, 1), pch = c(8, NA),
col = c("darkgreen", "red"))


## 単語長頻度分布

words <- names(freqL)
words[1]

## [1] "the"

nchar(words[1])

## [1] 3


#### 単語の文字数計算

nchar関数：文字数（エクセルのlen関数と同じ）

wdLen <- unlist(lapply(words, nchar))


#### 単語の文字数計算

nchar関数：文字数（エクセルのlen関数と同じ）

wdLenFreq <- table(wdLen)
wdLenFreq

## wdLen
##  1  2  3  4  5  6  7  8  9 10 11 12 13
##  4 14 13 28 20 15 28 18 16 15  9  5  3


#### 単語長分布（Types）

title = "Word Length Frequency Distribution (Types)"
xlabel = "Word Length"
ylabel = "Frequency"
xmax = length(wdLenFreq)
plot(wdLenFreq, type = "b", pch = 8, col = "orange", xlim = c(1, xmax), ylim = c(1,
30), main = title, xlab = xlabel, ylab = ylabel)


## Yule'sのK特性値

### 頻度スペクトラム

mFreq <- table(freqL)


#### 単語のToken数

tokens

## [1] 353


#### 頻度パタン(m)とその頻度

names(mFreq[3])

## [1] "3"

mFreq[3]

## 3
## 9

as.numeric(names(mFreq[3])) * mFreq[3]

##  3
## 27


#### Yule'sのK特性値の部分計算

as.numeric(names(mFreq[3])) * mFreq[3]

##  3
## 27

m2 <- mapply(function(x, y) as.numeric(x)^2 * y, names(mFreq), mFreq)


### Yule'sのK特性値

K <- 10000 * (sum(m2) - tokens)/tokens^2
K

## [1] 131.8