relative=FALSE (raw frequency)
relative=TRUE (relative frequency)
tfidf=0 (なにもしない)
tfidf=1 (tf-idf1)
tfidf=2 (tf-idf2)
simi=”corr” (相関係数)
simi=”cov” (共分散係数)
simi=”cosine” (コサイン類似度)
今まで作成したgetFreqMtxDir等の関数を最大限有効活用しましょう
mtx <- round(simil(t(mtx), method = simi), 2)
if (simi == "cov" || simi == "cosine" || simi == "corr") {
mtx <- simi(mtx, simi)
}
smi <- round(simil(t(tf), method = simi), 2)
if (simi=="corr"){res <- cor(freqM)}
else if (simi=="cov"){res <- cov(freqM)}
else if (simi=="cosine"){res <- simil(t(freqM), method=simi, diag=T)}
simi=”correlation” (相関係数)
simi=”cov” (共分散係数)
simi=”cosine” (コサイン類似度)
calcSimi <- function(dirname, relative = FALSE, tfidf = 0, simi = "correlation") {
freqM <- getFreqMtxDir(dirname, relative, tfidf)
library(proxy)
if (simi == "cov") {
res <- cov(freqM)
} else if (simi == "correlation" || simi == "cosine") {
res <- simil(t(freqM), method = simi, diag = T)
}
return(round(res, 2))
}
source("getFreqMtxDir.R")
calcSimi("../testData")
## Attaching package: 'proxy'
## The following object(s) are masked from 'package:stats':
##
## as.dist, dist
## test1 test2 test3
## test1 NA
## test2 -0.29 NA
## test3 -0.38 0.80 NA
calcSimi("../testData", simi = "cov")
## test1 test2 test3
## test1 21.70 -5.16 -5.59
## test2 -5.16 14.84 9.70
## test3 -5.59 9.70 9.98
calcSimi("../testData", tfidf = 1, simi = "cosine")
## test1 test2 test3
## test1 NA
## test2 0.29 NA
## test3 0.00 0.68 NA
res <- calcSimi("../testData", simi = "cosine")
write.csv(res, "test.csv")
以下にエラー as.data.frame.default(x[[i]], optional = TRUE) :
cannot coerce class 'c("simil", "dist")' into a data.frame
res <- as.matrix(res)
write.csv(res, "test.csv")
summary(pr_DB)
## * Similarity measures:
## Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice,
## eJaccard, Fager, Faith, fJaccard, Gower, Hamman, Jaccard,
## Kulczynski1, Kulczynski2, Michael, Mountford, Mozley, Ochiai,
## Pearson, Phi, Phi-squared, Russel, simple matching, Simpson,
## Stiles, Tanimoto, Tschuprow, Yule, Yule2
##
## * Distance measures:
## Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean,
## Geodesic, Hellinger, Kullback, Levenshtein, Mahalanobis,
## Manhattan, Minkowski, Podani, Soergel, supremum, Wave, Whittaker
行操作 ex. apply(x, 1, sum)
列操作 ex. apply(x, 2, sum)
各要素の平方根: apply(x,c(1,2),sqrt)
各要素の二乗: apply(x,c(1,2), function(z) z^2)
tf <- getFreqMtxDir("../univ")
# tf
colnames(tf)
## [1] "hiroshima" "kufs" "kyoto" "osaka" "osakaNew" "tokyo"
## [7] "waseda"
sum(tf[, 1])
## [1] 177
出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 177 524 749 306 534 500 928
## [1] 115
出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 115 256 331 163 249 218 403
\[ TTR=\frac{types}{tokens} \times 100 \] 出力結果
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 0.65 0.49 0.44 0.53 0.47 0.44 0.43
res <- getFreqMtxDir("../univ")
res[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
res[rownames(res) == "000", ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
res[(rownames(res) == "000") | (rownames(res) == "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
res[rownames(res) %in% c("000", "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
noise <- c("a", "an", "the")
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## the 7 25 36 16 31 52 39
res[!rownames(res) %in% noise, ]
文頭:^
文末:$
任意の文字:.
直前の文字を一回以上:+
tmp <- res[grep(rownames(res), pattern = "^[[:digit:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
res[!rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "^[[:alpha:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "ly$"), ]
rownames(tmp)
## [1] "additionally" "appropriately" "certainly"
## [4] "clearly" "collaboratively" "consistently"
## [7] "crucially" "culturally" "currently"
## [10] "daily" "deeply" "developmentally"
## [13] "domestically" "early" "effectively"
## [16] "equally" "especially" "eventually"
## [19] "fairly" "genuinely" "globally"
## [22] "historically" "increasingly" "internationally"
## [25] "keenly" "largely" "lastly"
## [28] "locally" "nearly" "only"
## [31] "previously" "really" "resolutely"
## [34] "scholarly" "seemingly" "simultaneously"
## [37] "sincerely" "spiritually" "steadily"
## [40] "truly" "unlikely" "widely"
# res[rownames(res) %in% rownames(tmp),]
df <- apply(res, 1, function(x) length(x[x > 0]))
df[1:5]
## 000 1 10 100 11
## 1 2 2 1 1
df[df == 7]
## a and as for in is
## 7 7 7 7 7 7
## not of the to university which
## 7 7 7 7 7 7
## with world
## 7 7
names(df[df == 7])
## [1] "a" "and" "as" "for" "in"
## [6] "is" "not" "of" "the" "to"
## [11] "university" "which" "with" "world"
出力結果
## a academic an and are as
## 7 5 5 7 5 7
## at be but can culture development
## 6 6 5 6 5 5
## education for future human i in
## 6 7 5 5 5 7
## is it japan knowledge not now
## 7 6 5 6 7 5
## of on one other our research
## 7 6 5 5 6 6
## s society such support that the
## 5 5 5 5 6 7
## their this through time to university
## 5 5 6 5 7 7
## we which will with world
## 5 7 5 7 7
出力結果(一部抜粋)
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## academic 0 1 6 0 4 4 2
## an 0 1 7 2 1 0 5
hclust(dist(data), methods=”ward”)
euclidean, canberra, manhattan, etc.
ward, complete, average, etc.
hc <- hclust(dist(t(res)))
plot(hc)
hc <- hclust(dist(t(res), method = "canberra"), method = "ward")
plot(hc)
names(df[(df >= 6)])
## [1] "a" "and" "as" "at" "be"
## [6] "can" "education" "for" "in" "is"
## [11] "it" "knowledge" "not" "of" "on"
## [16] "our" "research" "that" "the" "through"
## [21] "to" "university" "which" "with" "world"
res2 <- res[rownames(res) %in% names(df[df >= 6]), ]
hc <- hclust(dist(t(res2), method = "canberra"), method = "ward")
plot(hc)
hc <- hclust(dist(res2, method = "canberra"), method = "ward")
plot(hc)
library(manipulate)
manipulate({
hc <- hclust(dist(t(res2), method = dist_method), method = "ward")
plot(hc)
}, dist_method = picker("euclidean", "canberra", "manhattan"))
出力画面