# Load the project helper script; getFreqDir() is presumably defined there.
source("getFreqDir.R")

# Frequency table for the "univ" directory.
# NOTE(review): assumed to have terms in rows and documents in columns, like
# the testData printout later in this file - confirm.
tf <- getFreqDir("univ")

# Cluster the columns of tf (hence the transpose) with dist()'s default
# distance and the "ward.D2" agglomeration method.
# The call is kept as a single expression on purpose: plot.hclust() builds its
# default axis annotation from the call recorded in the hclust object.
res1 <- hclust(dist(t(tf)), method = "ward.D2")
plot(res1)

# Same distances, "ward.D" agglomeration method, for comparison.
res2 <- hclust(dist(t(tf)), method = "ward.D")
plot(res2)

# Redraw the first dendrogram with an explicit x-axis label and subtitle.
plot(res1, xlab = "euclidean distance", sub = "ward method")

# NOTE(review): `hcus` and `input` are not defined in this excerpt; this looks
# like a line meant for a Shiny server function - confirm before running it
# as part of this script.
plot(
  hcus,
  xlab = paste("distance = ", input$choice1),
  sub = paste("clustering = ", input$choice2)
)
NB. hcus には hclust() 関数の結果を代入しておくこと。
複数のテキストに共通して出現する単語を低く評価する

### TF-IDF 1

\[ w = tf \cdot \log\left(\frac{N}{df}\right) \]
# Frequency table for the small "testData" directory (printed below).
tf <- getFreqDir("testData")
tf
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4

# Select by position: first row.
tf[1, ]
## test1 test2 test3
## a 3 2 2

# Select by position: first column.
tf[, 1]
## [1] 3 4 13 0 7 0 0 0

# Select rows by name with a logical mask.
tf[rownames(tf) == "e", ]
## test1 test2 test3
## e 7 1 1

# Combine two masks with the elementwise OR operator.
tf[rownames(tf) == "e" | rownames(tf) == "d", ]
## test1 test2 test3
## d 0 0 1
## e 7 1 1

# Filter rows on their total count: strictly greater than 10.
tf[rowSums(tf) > 10, ]
## test1 test2 test3
## c 13 2 3
## f 0 11 9
## g 0 7 7

# Total count strictly below 4.
tf[rowSums(tf) < 4, ]
## test1 test2 test3
## d 0 0 1

# Total count of at least 10.
tf[rowSums(tf) >= 10, ]
## test1 test2 test3
## c 13 2 3
## f 0 11 9
## g 0 7 7

# Total count of at most 15.
tf[rowSums(tf) <= 15, ]
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## d 0 0 1
## e 7 1 1
## g 0 7 7
## h 0 0 4

# Both bounds at once, combined with the elementwise AND operator.
tf[(rowSums(tf) >= 10) & (rowSums(tf) <= 15), ]
## test1 test2 test3
## g 0 7 7
# Elementwise `|` inside if(): accepted here because both comparisons are
# length-one, so the condition is a single TRUE/FALSE.
hoge <- 1
if (hoge == 1 | hoge == 2) {
  print("hoge = 1 or 2")
} else {
  print("hoge=??")
}
## [1] "hoge = 1 or 2"

# Scalar, short-circuiting `||`: the second comparison decides the result.
hoge <- 2
if (hoge == 1 || hoge == 2) {
  print("hoge = 1 or 2")
} else {
  print("hoge=??")
}
## [1] "hoge = 1 or 2"

# Neither comparison holds, so the else branch runs.
hoge <- 3
if (hoge == 1 || hoge == 2) {
  print("hoge = 1 or 2")
} else {
  print("hoge=??")
}
## [1] "hoge=??"
# N: number of columns of tf (one column per text in the printout above).
N <- ncol(tf)
N
## [1] 3

# df: for every row (term), how many columns hold a count greater than zero.
df <- apply(tf, 1, function(counts) length(counts[counts > 0]))
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1

# Weight each count by log(N / df), rounded to two decimals. df has one entry
# per row, so it is recycled down each column of tf.
w <- round(tf * log(N / df), 2)
w
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ w = tf \cdot \left(\log\left(\frac{N}{df}\right) + 1\right) \]
# Second weighting: add 1 to log(N / df) before multiplying, so rows that have
# a positive count in every column keep their raw counts instead of dropping
# to zero.
w <- round(tf * (log(N / df) + 1), 2)
w
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
#' Weight a term-frequency table by inverse document frequency (TF-IDF).
#'
#' @param tf Term-frequency table (matrix or data frame) with one row per term
#'   and one column per document.
#' @param type Weighting variant. `1` gives `tf * log(N / df)`; `2` gives
#'   `tf * (log(N / df) + 1)`. `N` is the number of columns of `tf` and `df`
#'   is, per row, the number of columns holding a count greater than zero.
#' @return The weighted table, with the same dimensions as `tf`. Rows whose
#'   counts are all zero get weight 0.
calcTFIDF <- function(tf, type = 1) {
  # Fail with a clear message instead of falling through both branches and
  # tripping over an unassigned result.
  if (length(type) != 1L || !(type %in% c(1, 2))) {
    stop("type must be 1 or 2", call. = FALSE)
  }

  n_docs <- ncol(tf)
  # Document frequency: number of columns in which each row has a positive count.
  doc_freq <- rowSums(tf > 0)

  idf <- log(n_docs / doc_freq)
  # A row with no positive count has doc_freq 0, so idf is Inf and 0 * Inf
  # would be NaN; its counts are all zero, so pin the factor to 0 instead.
  idf[which(doc_freq == 0)] <- 0

  # idf has one entry per row and is recycled down each column of tf.
  if (type == 1) {
    tf * idf
  } else {
    tf * (idf + 1)
  }
}
# Reload the helper script before using the tfidf argument of getFreqDir().
# NOTE(review): presumably getFreqDir.R was extended with that argument at this
# point of the tutorial - confirm.
source("getFreqDir.R")

# Without the tfidf argument: the recorded output is the raw count table.
res <- getFreqDir("testData")
round(res, 2)
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4

# tfidf = 1: the recorded output equals the tf * log(N / df) table computed by
# hand earlier in this file.
res1 <- getFreqDir("testData", tfidf = 1)
round(res1, 2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39

# tfidf = 2: the recorded output equals the tf * (log(N / df) + 1) table
# computed by hand earlier in this file.
res2 <- getFreqDir("testData", tfidf = 2)
round(res2, 2)
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
# Attach shiny and launch the app stored in shiny_apps/app_hclust (path is
# relative to the current working directory).
# NOTE(review): judging by its name this is the hierarchical-clustering app -
# confirm against the app's own source.
library(shiny)
runApp("shiny_apps/app_hclust")