source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
res[rownames(res) == "000", ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
res[(rownames(res) == "000") | (rownames(res) == "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
res[rownames(res) %in% c("000", "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
noise <- c("a", "an", "the")
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## the 7 25 36 16 31 52 39
noise <- readLines("noise.txt")
noise
## [1] "a" "an" "the" "that" "s" "ve"
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## s 2 0 3 3 6 0 7
## that 0 13 8 2 7 10 7
## the 7 25 36 16 31 52 39
## ve 0 0 0 1 0 0 0
res[!rownames(res) %in% noise, ]
文頭:^
文末:$
任意の文字:.
直前の文字を一回以上出現:+
直前の文字を0または1回出現:?
tmp <- res[grep(rownames(res), pattern = "th[e|o]se"), ]
res[rownames(res) %in% rownames(tmp), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## these 0 0 1 1 2 0 7
## those 0 0 1 0 0 0 0
tmp <- res[grep(rownames(res), pattern = "class(es)?"), ]
res[rownames(res) %in% rownames(tmp), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## class 1 1 1 0 0 0 0
## classes 0 0 0 0 0 0 2
出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## continue 1 0 0 0 5 1 1
## continued 1 0 0 0 0 1 0
## continues 0 0 0 0 1 0 0
出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## high 0 0 3 0 0 0 0
## higher 0 0 1 0 0 0 1
## highest 0 0 0 0 0 0 1
tmp <- res[grep(rownames(res), pattern = "^[[:digit:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
res[!rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "^[[:alpha:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "ly$"), ]
rownames(tmp)
## [1] "additionally" "appropriately" "certainly"
## [4] "clearly" "collaboratively" "consistently"
## [7] "crucially" "culturally" "currently"
## [10] "daily" "deeply" "developmentally"
## [13] "domestically" "early" "effectively"
## [16] "equally" "especially" "eventually"
## [19] "fairly" "genuinely" "globally"
## [22] "historically" "increasingly" "internationally"
## [25] "keenly" "largely" "lastly"
## [28] "locally" "nearly" "only"
## [31] "previously" "really" "resolutely"
## [34] "scholarly" "seemingly" "simultaneously"
## [37] "sincerely" "spiritually" "steadily"
## [40] "truly" "unlikely" "widely"
# res[rownames(res) %in% rownames(tmp),]
出力結果(一部抜粋)
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## abounding 0 0 0 0 0 0 1
## according 0 0 1 0 0 0 0
## allowing 0 0 1 0 0 0 0
## beckoning 0 0 0 0 1 0 0
## being 0 0 1 0 0 1 0
出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## designed 0 0 0 0 0 0 2
## determined 0 0 0 0 0 2 0
## developed 0 0 0 0 1 0 1
## dignified 0 0 1 0 0 0 0
## distinguished 0 1 0 0 0 0 1
df <- apply(res, 1, function(x) length(x[x > 0]))
df[1:5]
## 000 1 10 100 11
## 1 2 2 1 1
df[df == 7]
## a and as for in is
## 7 7 7 7 7 7
## not of the to university which
## 7 7 7 7 7 7
## with world
## 7 7
names(df[df == 7])
## [1] "a" "and" "as" "for" "in"
## [6] "is" "not" "of" "the" "to"
## [11] "university" "which" "with" "world"
res[rownames(res) %in% names(df[df == 7]), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## and 7 18 38 15 26 22 37
## as 2 4 7 2 8 6 8
## for 4 1 9 7 3 3 5
## in 4 20 26 4 11 4 32
## is 3 12 6 4 2 10 10
## not 1 1 2 2 1 2 1
## of 5 18 35 10 26 38 34
## the 7 25 36 16 31 52 39
## to 11 17 26 11 17 15 35
## university 6 5 16 18 21 9 22
## which 1 4 4 1 2 2 4
## with 1 8 3 4 3 5 13
## world 1 6 2 1 2 4 3
出力結果(一部抜粋)
res[rownames(res) %in% names(df[df <= 2]), ][1:5, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
出力結果(一部抜粋)
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## academic 0 1 6 0 4 4 2
## an 0 1 7 2 1 0 5
出力結果(一部抜粋)
## [1] 47
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## academic 0 1 6 0 4 4 2
## an 0 1 7 2 1 0 5
## and 7 18 38 15 26 22 37
## are 0 4 5 1 0 3 5
hclust(dist(data), methods=”ward”)
euclidean, canberra, manhattan, etc.
ward, complete, average, etc.
hc <- hclust(dist(t(res)))
plot(hc)
hc <- hclust(dist(t(res), method = "canberra"), method = "ward")
plot(hc)
names(df[(df >= 6)])
## [1] "a" "and" "as" "at" "be"
## [6] "can" "education" "for" "in" "is"
## [11] "it" "knowledge" "not" "of" "on"
## [16] "our" "research" "that" "the" "through"
## [21] "to" "university" "which" "with" "world"
res2 <- res[rownames(res) %in% names(df[df >= 6]), ]
hc <- hclust(dist(t(res2), method = "canberra"), method = "ward")
dist(t(res2), method = "canberra")
## hiroshima kufs kyoto osaka osakaNew tokyo
## kufs 15.296
## kyoto 16.566 9.536
## osaka 11.699 12.707 10.781
## osakaNew 13.486 10.465 8.549 10.305
## tokyo 14.081 8.747 7.612 9.571 6.719
## waseda 15.859 7.776 6.176 11.735 8.058 6.923
plot(hc)
rect.hclust(hc, k = 2, border = "red")
hc <- hclust(dist(res2, method = "canberra"), method = "ward")
plot(hc)
rect.hclust(hc, k = 3, border = "red")
library(manipulate)
manipulate({
hc <- hclust(dist(t(res2), method = dist_method), method = "ward")
plot(hc)
}, dist_method = picker("euclidean", "canberra", "manhattan", initial = "canberra"))
出力画面
manipulate({
mtx <- t(res2)
if (data == "words")
mtx <- res2
hc <- hclust(dist(mtx, method = dist_method), method = "ward")
plot(hc)
}, data = picker("text", "words"), dist_method = picker("euclidean", "canberra",
"manhattan"))
出力画面
出力画面
manipulate({
mtx <- t(res2)
if (data == "words")
mtx <- res2
hc <- hclust(dist(mtx, method = dist_method), method = "ward")
plot(hc)
if (rect == TRUE)
rect.hclust(hc, k = 3, border = "red")
}, data = picker("text", "words"), dist_method = picker("euclidean", "canberra",
"manhattan"), rect = checkbox(FALSE, "draw rectangles"))
manipulate({
mtx <- t(res2)
if (data == "words")
mtx <- res2
hc <- hclust(dist(mtx, method = dist_method), method = "ward")
plot(hc)
rect.hclust(hc, k = clust_num, border = "red")
}, data = picker("text", "words"), dist_method = picker("euclidean", "canberra",
"manhattan"), clust_num = slider(2, 5))
出力画面
出力画面