パッケージの読み込み

manipulate package

library(manipulate)

前回の実習について(別ファイル参照)

指定ディレクトリのファイル一覧を取得(相対パス)

fLst<- list.files("nhk_news")
fLst
[1] "nhk_news_1.txt" "nhk_news_2.txt"
[3] "nhk_news_3.txt"

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

テキストファイル”GSLC2021.txt”の単語頻度データを取得

freqData <- getFreq("GSLC2021.txt")
dim(freqData)
[1] 271   1
head(freqData)

頻度統計情報

sum(freqData$Freq)
[1] 578
max(freqData$Freq)
[1] 37
min(freqData$Freq)
[1] 1
# Mean & Median
mean(freqData$Freq)
[1] 2.132841
median(freqData$Freq)
[1] 1
# Variance & Standard Deviation
var(freqData$Freq)
[1] 15.3971
sd(freqData$Freq)
[1] 3.923914
K=freqData[1,1]
A=0.8

rank <- seq(1:dim(freqData)[1])
zipf <- K/rank^A

グラフ描画

凡例: legend

配置:“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center”
ラベル
lty: 線の種類
pch: プロットの種類
#理論値
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqData)),ylim=c(1,50),main="Zipf's Law", xlab="Rank", ylab="Frequency")

#頻度散布図の重ね書き
par(new=T)
plot(rank,freqData[,1], xlim=c(1,nrow(freqData)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main="Zipf's Law", xlab="Rank", ylab="Frequency")

#凡例
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqData)),ylim=c(1,50),main="Zipf's Law", xlab="Rank", ylab="Frequency")
par(new=T)
plot(rank,freqData[,1], xlim=c(1,nrow(freqData)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main="Zipf's Law", xlab="Rank", ylab="Frequency")
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))

課題2 (締め切り12月7日)

Manipulat関数を使用し、上記の変数freqDataの粗頻度数とそのZipf’sの理論式の散布図に対して、定数AとKをインタラクティブに変更できる図を作成しなさい。

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

作成上の注意

  • 定数Aの値:範囲0.5-1.5, 初期値=0.8, step=0.05(0.1に設定しても構いません)
  • 定数Kの値:範囲10-100, 初期値= 単語出現頻度の最大値, step=10

apply関数

単語情報の一部を抽出

tmp<-rownames(freqData)[1:10]

lapply関数

  • paste
paste(tmp, "@GSLC2021")
 [1] "the @GSLC2021"      "and @GSLC2021"     
 [3] "of @GSLC2021"       "in @GSLC2021"      
 [5] "to @GSLC2021"       "language @GSLC2021"
 [7] "research @GSLC2021" "culture @GSLC2021" 
 [9] "by @GSLC2021"       "with @GSLC2021"    

lapply

lapply(tmp, paste, "@GSLC2021")
[[1]]
[1] "the @GSLC2021"

[[2]]
[1] "and @GSLC2021"

[[3]]
[1] "of @GSLC2021"

[[4]]
[1] "in @GSLC2021"

[[5]]
[1] "to @GSLC2021"

[[6]]
[1] "language @GSLC2021"

[[7]]
[1] "research @GSLC2021"

[[8]]
[1] "culture @GSLC2021"

[[9]]
[1] "by @GSLC2021"

[[10]]
[1] "with @GSLC2021"

sapply:名前の属性付き

sapply(tmp, paste, "@GSLC2021")
                 the                  and 
     "the @GSLC2021"      "and @GSLC2021" 
                  of                   in 
      "of @GSLC2021"       "in @GSLC2021" 
                  to             language 
      "to @GSLC2021" "language @GSLC2021" 
            research              culture 
"research @GSLC2021"  "culture @GSLC2021" 
                  by                 with 
      "by @GSLC2021"     "with @GSLC2021" 

apply関数

粗頻度と相対頻度をcbindで結合

tmpMtx <- cbind(freqData, freqData$Freq/sum(freqData$Freq))
colnames(tmpMtx)<- c("Freq", "RelativFreq")
head(tmpMtx)

Apply on rows

res <- apply(tmpMtx,1,sum)
head(res)
     the      and       of       in       to 
37.06401 31.05363 28.04844 20.03460 16.02768 
language 
15.02595 

Apply on columns

apply(tmpMtx,2,sum)
       Freq RelativFreq 
        578           1 

Apply on elements

res <- apply(tmpMtx,c(1,2), function(x) x*10)
head(res)
         Freq RelativFreq
the       370   0.6401384
and       310   0.5363322
of        280   0.4844291
in        200   0.3460208
to        160   0.2768166
language  150   0.2595156

RMeCab

RMeCabインストール(別ファイル参照)

path_home <- system("echo $HOME",intern=T)
lib_path <- paste(path_home,"/usr/local/lib/libmecab.so.2", sep="")

ライブラリー&パッケージの読み込み

dyn.load(lib_path)
library(RMeCab)

分かち書き&頻度表(テキストファイル使用)

freqNews1<-RMeCabFreq("nhk_news/nhk_news_1.txt")
file = nhk_news/nhk_news_1.txt 
length = 90 
freqNews1 <-freqNews1[order(freqNews1$Freq, decreasing = TRUE),]
head(freqNews1)

(同一ディレクトリ内)複数ファイルから頻度行列を作成

res1 <- docMatrix("nhk_news",  pos = c("名詞","助詞"))
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
Term Document Matrix includes 2 information rows! 
whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
if you remove these rows, run
result[ rownames(result) !=  "[[LESS-THAN-1]]" , ]
result[ rownames(result) !=  "[[TOTAL-TOKENS]]" , ]
head(res1)
                  docs
terms              nhk_news_1.txt nhk_news_2.txt
  .                             0              1
  [[LESS-THAN-1]]               0              0
  [[TOTAL-TOKENS]]            154            192
  %                            0              1
  1                             1              0
  2                             0              0
                  docs
terms              nhk_news_3.txt
  .                             0
  [[LESS-THAN-1]]               0
  [[TOTAL-TOKENS]]            148
  %                            0
  1                             2
  2                             1

minFreq

res2 <- docMatrix("nhk_news",  pos = c("名詞","助詞","動詞") , minFreq=5)
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
Term Document Matrix includes 2 information rows! 
whose names are [[LESS-THAN-5]] and [[TOTAL-TOKENS]]
if you remove these rows, run
result[ rownames(result) !=  "[[LESS-THAN-5]]" , ]
result[ rownames(result) !=  "[[TOTAL-TOKENS]]" , ]
head(res2)
                  docs
terms              nhk_news_1.txt nhk_news_2.txt
  [[LESS-THAN-5]]              95             98
  [[TOTAL-TOKENS]]            154            192
  アメリカ                      0              0
  いる                          0              7
  が                            0             11
  カブトムシ                    5              0
                  docs
terms              nhk_news_3.txt
  [[LESS-THAN-5]]              78
  [[TOTAL-TOKENS]]            148
  アメリカ                      6
  いる                          0
  が                            8
  カブトムシ                    0

Term Frequency-Inverse Document Frequency

  • テキスト特有の出現単語に対して、重みづけをする
  • テキストに共通する単語に対しては(低い)重み付け

TF-IDF 1

\[w=tf*log(\frac{N}{df}) \]

TF-IDF 2

\[w=tf*(log(\frac{N}{df})+1) \]

RMeCabの関数を利用

res3 <- docMatrix("nhk_news",  pos = c("名詞","助詞","動詞") , minFreq=5, weight = "tf*idf")
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
head(res3)
            docs
terms        nhk_news_1.txt nhk_news_2.txt
  アメリカ         0.000000       0.000000
  いる             0.000000      18.094738
  が               0.000000      17.434588
  カブトムシ      12.924813       0.000000
  する             9.509775       9.509775
  ツイッター       0.000000       0.000000
            docs
terms        nhk_news_3.txt
  アメリカ        15.509775
  いる             0.000000
  が              12.679700
  カブトムシ       0.000000
  する             0.000000
  ツイッター      15.509775
LS0tCnRpdGxlOiAiTGVjMDc6IEFwcGx56Zai5pWwLCBSTWVDYWIiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIOODkeODg+OCseODvOOCuOOBruiqreOBv+i+vOOBvwojIyMgbWFuaXB1bGF0ZSBwYWNrYWdlCmBgYHtyLGVycm9yPUZBTFNFLGV2YWw9RkFMU0V9CmxpYnJhcnkobWFuaXB1bGF0ZSkKYGBgCgojIyMg5YmN5Zue44Gu5a6f57+S44Gr44Gk44GE44Gm77yI5Yil44OV44Kh44Kk44Or5Y+C54Wn77yJCisgPGEgaHJlZj0iaHR0cHM6Ly93d3cubGVhcm5ieWV4YW1wbGUub3JnL3ItcGxvdC1mdW5jdGlvbi8iIHRhcmdldD0iX2JsYW5rIj5wbG905YaF5byV5pWwPC9hPgoKIyMg5oyH5a6a44OH44Kj44Os44Kv44OI44Oq44Gu44OV44Kh44Kk44Or5LiA6Kan44KS5Y+W5b6XKOebuOWvvuODkeOCuSkKYGBge3J9CmZMc3Q8LSBsaXN0LmZpbGVzKCJuaGtfbmV3cyIpCmZMc3QKYGBgCgojIyA8YSBocmVmPSJodHRwczovL3d3dzEuZG9zaGlzaGEuYWMuanAvfm1qaW4vUi82MC82MC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+WmlwZidz44Gu5rOV5YmHPC9hPgokJEZyZXF1ZW5jeT1cZnJhY3tLfXtSYW5rXkF9ICAkJApLLEE6IOWumuaVsAoKIyMjIOODhuOCreOCueODiOODleOCoeOCpOODqyJHU0xDMjAyMS50eHQi44Gu5Y2Y6Kqe6aC75bqm44OH44O844K/44KS5Y+W5b6XCmBgYHtyfQpmcmVxRGF0YSA8LSBnZXRGcmVxKCJHU0xDMjAyMS50eHQiKQpkaW0oZnJlcURhdGEpCmhlYWQoZnJlcURhdGEpCmBgYAojIyMg6aC75bqm57Wx6KiI5oOF5aCxCmBgYHtyfQpzdW0oZnJlcURhdGEkRnJlcSkKbWF4KGZyZXFEYXRhJEZyZXEpCm1pbihmcmVxRGF0YSRGcmVxKQojIE1lYW4gJiBNZWRpYW4KbWVhbihmcmVxRGF0YSRGcmVxKQptZWRpYW4oZnJlcURhdGEkRnJlcSkKIyBWYXJpYW5jZSAmIFN0YW5kYXJkIERldmlhdGlvbgp2YXIoZnJlcURhdGEkRnJlcSkKc2QoZnJlcURhdGEkRnJlcSkKYGBgCgpgYGB7cn0KSz1mcmVxRGF0YVsxLDFdCkE9MC44CgpyYW5rIDwtIHNlcSgxOmRpbShmcmVxRGF0YSlbMV0pCnppcGYgPC0gSy9yYW5rXkEKYGBgCgojIyMg44Kw44Op44OV5o+P55S7CiMjIyMg5Yeh5L6LOiA8YSBocmVmPSJodHRwczovL3N0YXRzLmJpb3BhcHlydXMuanAvci9ncmFwaC9sZWdlbmQuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPmxlZ2VuZDwvYT4KYGBgCumFjee9ru+8muKAnGJvdHRvbXJpZ2h04oCdLCDigJxib3R0b23igJ0sIOKAnGJvdHRvbWxlZnTigJ0sIOKAnGxlZnTigJ0sIOKAnHRvcGxlZnTigJ0sIOKAnHRvcOKAnSwg4oCcdG9wcmlnaHTigJ0sIOKAnHJpZ2h04oCdLCDigJxjZW50ZXLigJ0K44Op44OZ44OrCmx0eTog57ea44Gu56iu6aGeCnBjaDog44OX44Ot44OD44OI44Gu56iu6aGeCmBgYApgYGB7ciwgZXZhbD1GQUxTRX0KI+eQhuirluWApApwbG90KHppcGYsIGxvZz0ieHkiLCB0eXBlPSJsIixjb2w9InJlZCIgLAp4bGltPWMoMSxucm93KGZyZXFEYXRhKSkseWxpbT1jKDEsNTApLG1haW49IlppcGYncyBMYXciLCB4bGFiPSJSYW5rIiwgeWxhYj0iRnJlcXVlbmN5IikKCiPpoLvluqbmlaPluIPlm7Pjga7ph43jga3mm7jjgY0KcGFyKG5ldz1UKQpwbG90KHJhbmssZnJlcURhdGFbLDFdLCB4bGltPWMoMSxucm93KGZyZXFEYXRhKSksIHlsaW09YygxLDUwKSxsb2c9Inh5IixwY2g9OCwgY29sPSJkYXJrZ3JlZW4iLCBtYWluPSJaaXBmJ3MgTGF3IiwgeGxhYj0iUmFuayIsIHlsYWI9IkZyZXF1ZW5jeSIpCgoj5Yeh5L6LCmxlZ2VuZCgidG9wcmlnaHQiLGMoIkZyZXF1ZW5jeSIsIlppcGYncyBsYXciKSxsdHk9YyhOQSwxKSxwY2g9Yyg4LE5BKSxjb2w9YygiZGFya2dyZWVuIiwicmVkIikpCgpgYGAKCmBgYHtyfQpwbG90KHppcGYsIGxvZz0ieHkiLCB0eXBlPSJsIixjb2w9InJlZCIgLAp4bGltPWMoMSxucm93KGZyZXFEYXRhKSkseWxpbT1jKDEsNTApLG1haW49IlppcGYncyBMYXciLCB4bGFiPSJSYW5rIiwgeWxhYj0iRnJlcXVlbmN5IikKcGFyKG5ldz1UKQpwbG90KHJhbmssZnJlcURhdGFbLDFdLCB4bGltPWMoMSxucm93KGZyZXFEYXRhKSksIHlsaW09YygxLDUwKSxsb2c9Inh5IixwY2g9OCwgY29sPSJkYXJrZ3JlZW4iLCBtYWluPSJaaXBmJ3MgTGF3IiwgeGxhYj0iUmFuayIsIHlsYWI9IkZyZXF1ZW5jeSIpCmxlZ2VuZCgidG9wcmlnaHQiLGMoIkZyZXF1ZW5jeSIsIlppcGYncyBsYXciKSxsdHk9YyhOQSwxKSxwY2g9Yyg4LE5BKSxjb2w9YygiZGFya2dyZWVuIiwicmVkIikpCmBgYAojIyDoqrLpoYwyICjnt6DjgoHliIfjgooxMuaciDfml6UpCi0g6Kqy6aGM44Gv44CBUlN0dWRpbyBDbG91ZOOBruWQhOiHquOBrldvcmtzcGFjZeOBp+eiuuiqjeOBl+OBvuOBmeOBruOBp+OAgeiqsumhjOOBjOOBp+OBjeOBn+OCieengeOBq+efpeOCieOBm+OBpuOBj+OBoOOBleOBhOOAggoKIyMjIE1hbmlwdWxhdOmWouaVsOOCkuS9v+eUqOOBl+OAgeS4iuiomOOBruWkieaVsGZyZXFEYXRh44Gu57KX6aC75bqm5pWw44Go44Gd44GuWmlwZuKAmXPjga7nkIboq5blvI/jga7mlaPluIPlm7Pjgavlr77jgZfjgabjgIHlrprmlbBB44GoS+OCkuOCpOODs+OCv+ODqeOCr+ODhuOCo+ODluOBq+WkieabtOOBp+OBjeOCi+Wbs+OCkuS9nOaIkOOBl+OBquOBleOBhOOAggokJEZyZXF1ZW5jeT1cZnJhY3tLfXtSYW5rXkF9ICAkJApLLEE6IOWumuaVsAoKIyMjIyDkvZzmiJDkuIrjga7ms6jmhI8KKyDlrprmlbBB44Gu5YCk77ya56+E5ZuyMC41LTEuNSwg5Yid5pyf5YCkPTAuOCwgc3RlcD0wLjA177yIMC4x44Gr6Kit5a6a44GX44Gm44KC5qeL44GE44G+44Gb44KT77yJCisg5a6a5pWwS+OBruWApO+8muevhOWbsjEwLTEwMCwg5Yid5pyf5YCkPSDljZjoqp7lh7rnj77poLvluqbjga7mnIDlpKflgKQsIHN0ZXA9MTAKCiMjIDxhIGhyZWY9Imh0dHBzOi8vc3RhdHMuYmlvcGFweXJ1cy5qcC9yL2Jhc2ljL2FwcGx5Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5hcHBseemWouaVsDwvYT4KCiMjIyDljZjoqp7mg4XloLHjga7kuIDpg6jjgpLmir3lh7oKYGBge3J9CnRtcDwtcm93bmFtZXMoZnJlcURhdGEpWzE6MTBdCmBgYAoKIyMjIGxhcHBseemWouaVsAoqIHBhc3RlCmBgYHtyfQpwYXN0ZSh0bXAsICJAR1NMQzIwMjEiKQpgYGAKIyMjIGxhcHBseQpgYGB7cn0KbGFwcGx5KHRtcCwgcGFzdGUsICJAR1NMQzIwMjEiKQpgYGAKCiMjIyBzYXBwbHk65ZCN5YmN44Gu5bGe5oCn5LuY44GNCmBgYHtyfQpzYXBwbHkodG1wLCBwYXN0ZSwgIkBHU0xDMjAyMSIpCmBgYAojIyMgYXBwbHnplqLmlbAKIyMjIyDnspfpoLvluqbjgajnm7jlr77poLvluqbjgpJjYmluZOOBp+e1kOWQiApgYGB7cn0KdG1wTXR4IDwtIGNiaW5kKGZyZXFEYXRhLCBmcmVxRGF0YSRGcmVxL3N1bShmcmVxRGF0YSRGcmVxKSkKY29sbmFtZXModG1wTXR4KTwtIGMoIkZyZXEiLCAiUmVsYXRpdkZyZXEiKQpoZWFkKHRtcE10eCkKYGBgCiMjIyBBcHBseSBvbiByb3dzCmBgYHtyfQpyZXMgPC0gYXBwbHkodG1wTXR4LDEsc3VtKQpoZWFkKHJlcykKYGBgCiMjIyBBcHBseSBvbiBjb2x1bW5zCmBgYHtyfQphcHBseSh0bXBNdHgsMixzdW0pCmBgYAojIyMgQXBwbHkgb24gZWxlbWVudHMKYGBge3J9CnJlcyA8LSBhcHBseSh0bXBNdHgsYygxLDIpLCBmdW5jdGlvbih4KSB4KjEwKQpoZWFkKHJlcykKYGBgCgojIyBSTWVDYWIKIyMjIFJNZUNhYuOCpOODs+OCueODiOODvOODq++8iOWIpeODleOCoeOCpOODq+WPgueFp++8iQoKYGBge3J9CnBhdGhfaG9tZSA8LSBzeXN0ZW0oImVjaG8gJEhPTUUiLGludGVybj1UKQpsaWJfcGF0aCA8LSBwYXN0ZShwYXRoX2hvbWUsIi91c3IvbG9jYWwvbGliL2xpYm1lY2FiLnNvLjIiLCBzZXA9IiIpCmBgYAoKIyMjIOODqeOCpOODluODqeODquODvCbjg5Hjg4PjgrHjg7zjgrjjga7oqq3jgb/ovrzjgb8KYGBge3J9CmR5bi5sb2FkKGxpYl9wYXRoKQpsaWJyYXJ5KFJNZUNhYikKYGBgCgojIyDliIbjgYvjgaHmm7jjgY3vvIbpoLvluqbooajvvIjjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vkvb/nlKjvvIkKYGBge3J9CmZyZXFOZXdzMTwtUk1lQ2FiRnJlcSgibmhrX25ld3MvbmhrX25ld3NfMS50eHQiKQpmcmVxTmV3czEgPC1mcmVxTmV3czFbb3JkZXIoZnJlcU5ld3MxJEZyZXEsIGRlY3JlYXNpbmcgPSBUUlVFKSxdCmhlYWQoZnJlcU5ld3MxKQpgYGAKCiMjIyDvvIjlkIzkuIDjg4fjgqPjg6zjgq/jg4jjg6rlhoXvvInopIfmlbDjg5XjgqHjgqTjg6vjgYvjgonpoLvluqbooYzliJfjgpLkvZzmiJAKKyA8YSBocmVmPSJodHRwOi8vcm1lY2FiLmpwL3dpa2kvaW5kZXgucGhwP1JNZUNhYkZ1bmN0aW9ucyN2NjZiYjIzMyIgdGFyZ2V0PSJfYmxhbmsiPlJNZUNhYkZ1bmN0aW9uczwvYT4KYGBge3J9CnJlczEgPC0gZG9jTWF0cml4KCJuaGtfbmV3cyIsICBwb3MgPSBjKCLlkI3oqZ4iLCLliqnoqZ4iKSkKaGVhZChyZXMxKQpgYGAKCiMjIG1pbkZyZXEKYGBge3J9CnJlczIgPC0gZG9jTWF0cml4KCJuaGtfbmV3cyIsICBwb3MgPSBjKCLlkI3oqZ4iLCLliqnoqZ4iLCLli5XoqZ4iKSAsIG1pbkZyZXE9NSkKaGVhZChyZXMyKQpgYGAKCiMjIyBUZXJtIEZyZXF1ZW5jeS1JbnZlcnNlIERvY3VtZW50IEZyZXF1ZW5jeQorIOODhuOCreOCueODiOeJueacieOBruWHuuePvuWNmOiqnuOBq+WvvuOBl+OBpuOAgemHjeOBv+OBpeOBkeOCkuOBmeOCiworIOODhuOCreOCueODiOOBq+WFsemAmuOBmeOCi+WNmOiqnuOBq+WvvuOBl+OBpuOBr++8iOS9juOBhO+8iemHjeOBv+S7mOOBkQoKIyMjIFRGLUlERiAxCiQkdz10Zipsb2coXGZyYWN7Tn17ZGZ9KSAkJAoKIyMjIFRGLUlERiAyCiQkdz10ZioobG9nKFxmcmFje059e2RmfSkrMSkgJCQKCiMjIFJNZUNhYuOBrumWouaVsOOCkuWIqeeUqApgYGB7cn0KcmVzMyA8LSBkb2NNYXRyaXgoIm5oa19uZXdzIiwgIHBvcyA9IGMoIuWQjeipniIsIuWKqeipniIsIuWLleipniIpICwgbWluRnJlcT01LCB3ZWlnaHQgPSAidGYqaWRmIikKaGVhZChyZXMzKQpgYGAK