source("getFreqDir.R")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the frequency table for the "univ" directory with getFreqDir(),
# presumably defined in the getFreqDir.R script sourced above.
# NOTE(review): judging by the head() output printed further down in this
# file, the result has one row per word and one column per text -- confirm
# against getFreqDir.R.
univTable <- getFreqDir("univ")
# Select-list input for choosing which text (a column name of `tf`) to show;
# the choice is exposed under the input id "univName" and the fourth column
# name is preselected.
# NOTE(review): the trailing comma after the closing parenthesis suggests this
# call is an excerpt from a longer argument list (e.g. a Shiny UI layout) and
# is not meant to run as a top-level statement -- confirm where it belongs.
# NOTE(review): `tf` is only assigned further down in this file, so it must
# already exist wherever this fragment is evaluated.
selectInput(inputId ="univName",
label = "Choose a text:",
choices = colnames(tf),
selected = colnames(tf)[4]),
# Rank the words by their frequency in the currently selected text and keep
# the (at most) 50 most frequent ones together with their labels.
# NOTE(review): `input` is presumably the Shiny server `input` object, so this
# fragment has to run inside a reactive context -- confirm.
currentText <- input$univName
# Index the column by name instead of building a logical mask over colnames().
textFreq <- tf[order(tf[, currentText], decreasing = TRUE), ]
# head() stops at the rows that exist; `[1:50]` would pad the result with NA
# whenever the table holds fewer than 50 words.
freq <- head(textFreq[, currentText], 50)
label <- head(rownames(textFreq), 50)
+ 10分以上考えても分からない場合は、質問してください。
\[ w = tf \times \log\left(\frac{N}{df}\right) \]
# Load the frequency table for the small "testData" corpus and display it.
testTF <- getFreqDir("testData")
testTF
## test1 test2 test3 test4
## c 13 2 3 5
## e 7 1 1 2
## b 4 4 0 4
## a 3 2 2 4
## f 0 11 9 20
## g 0 7 7 14
## h 0 0 4 4
## d 0 0 1 1
# Total number of word tokens in each text (one sum per column).
colSums(testTF)
## test1 test2 test3 test4
## 27 27 27 54
# Relative frequency of each word within its text, so every column sums to 1.
# A plain `testTF / colSums(testTF)` recycles the four column totals down the
# rows, which divides a cell by the total that happens to match its row
# position instead of the total of its own column (e.g. c/test4 became 5/27
# rather than 5/54). sweep() applies the totals along margin 2 (columns).
sweep(testTF, 2, colSums(testTF), "/")
## test1 test2 test3 test4
## c 0.4814815 0.07407407 0.11111111 0.09259259
## e 0.2592593 0.03703704 0.03703704 0.03703704
## b 0.1481481 0.14814815 0.00000000 0.07407407
## a 0.1111111 0.07407407 0.07407407 0.07407407
## f 0.0000000 0.40740741 0.33333333 0.37037037
## g 0.0000000 0.25925926 0.25925926 0.25925926
## h 0.0000000 0.00000000 0.14814815 0.07407407
## d 0.0000000 0.00000000 0.03703704 0.01851852
# Overall frequency of each word across all texts, most frequent first.
sort(rowSums(testTF), decreasing = TRUE)
## f g c b e a h d
## 40 28 23 12 11 11 8 2

# --- TF-IDF weighting --------------------------------------------------------
# N: number of texts (columns of the table).
N <- ncol(testTF)
# Document frequency: for every word (row), how many texts contain it at
# least once.
testDF <- apply(
  testTF,
  MARGIN = 1,
  FUN = function(counts) length(counts[counts > 0])
)
# testDF holds one value per row, so recycling down the columns lines each
# weight up with its own word.
testWeighted <- testTF * log(N / testDF)
round(testWeighted, digits = 2)
## test1 test2 test3 test4
## c 0.00 0.00 0.00 0.00
## e 0.00 0.00 0.00 0.00
## b 1.15 1.15 0.00 1.15
## a 0.00 0.00 0.00 0.00
## f 0.00 3.16 2.59 5.75
## g 0.00 2.01 2.01 4.03
## h 0.00 0.00 2.77 2.77
## d 0.00 0.00 0.69 0.69
# Keep only the words whose weights are not all zero, then show the result.
testTFIDF <- testWeighted[rowSums(testWeighted) > 0, ]
testTFIDF
## test1 test2 test3 test4
## b 1.150728 1.150728 0.0000000 1.1507283
## f 0.000000 3.164503 2.5891387 5.7536414
## g 0.000000 2.013775 2.0137745 4.0275490
## h 0.000000 0.000000 2.7725887 2.7725887
## d 0.000000 0.000000 0.6931472 0.6931472
\[ \cos(x, y) = \frac{\sum_{i} x_{i} y_{i}}{\sqrt{\sum_{i} x_{i}^{2} \sum_{i} y_{i}^{2}}} \]
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
# Cosine similarity between the texts based on raw term frequencies. The
# table is transposed first so that each text becomes a row for simil().
round(
  simil(t(testTF), method = "cosine"),
  digits = 2
)
## test1 test2 test3
## test2 0.25
## test3 0.26 0.90
## test4 0.26 0.98 0.97
# The same comparison on the TF-IDF weighted table.
round(
  simil(t(testTFIDF), method = "cosine"),
  digits = 2
)
## test1 test2 test3
## test2 0.29
## test3 0.00 0.72
## test4 0.15 0.92 0.93
# Frequency table for the texts in the "univ" directory.
tf <- getFreqDir("univ")
# View() opens a GUI data viewer, which is only meaningful in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(tf)
}
# The 30 words with the highest total frequency across all texts. head()
# returns fewer entries instead of NA padding when there are under 30 words.
top30 <- head(sort(rowSums(tf), decreasing = TRUE), 30)
names(top30)
## [1] "the" "of" "and" "to" "in"
## [6] "university" "a" "is" "that" "as"
## [11] "osaka" "research" "with" "for" "its"
## [16] "education" "on" "it" "i" "students"
## [21] "s" "we" "by" "be" "will"
## [26] "this" "society" "knowledge" "are" "an"
# Hierarchical clustering of the texts (columns of `tf`, hence the transpose)
# using cosine distance and Ward linkage, on the full vocabulary. The "cosine"
# method comes from proxy's dist(), which masks stats::dist once proxy is
# attached; the namespace is spelled out here to make that visible.
hc <- hclust(
  proxy::dist(t(tf), method = "cosine"),
  method = "ward.D2"
)
plot(hc)

# Same clustering restricted to the 30 most frequent words.
TFtop30 <- tf[is.element(rownames(tf), names(top30)), ]
hc <- hclust(
  proxy::dist(t(TFtop30), method = "cosine"),
  method = "ward.D2"
)
plot(hc)
# k-means starts from randomly chosen centres when `centers` is a count, so
# fix the seed to make the partition reproducible.
set.seed(1209)
# Partition the top-30 words (rows) into 5 clusters by their frequency
# profile across the texts.
km.freq <- kmeans(x = TFtop30, centers = 5, iter.max = 100)
# Cluster id assigned to each word.
km.freq[["cluster"]]
## to and the university of a for
## 4 5 5 4 5 4 2
## in be is as i research s
## 4 2 3 3 2 2 2
## education it knowledge on with that students
## 2 2 2 2 3 3 2
## are will an this we its by
## 2 2 2 2 2 2 2
## society osaka
## 2 1
# clusplot() is provided by the cluster package, which is not attached
# anywhere in the visible part of this file; attach it here so the calls
# below resolve (a no-op if it is already attached).
library(cluster)
# Plot of the k-means partition of the top-30 words.
clusplot(
  TFtop30, km.freq$cluster,
  color = TRUE, shade = TRUE, labels = 2, lines = 0
)
# Same plot with the axis limits narrowed to zoom in on one region.
clusplot(
  TFtop30, km.freq$cluster,
  color = TRUE, shade = TRUE, labels = 2, lines = 0,
  xlim = c(-1.8, -0.7), ylim = c(-0.7, 1.0)
)
# Cluster centres: mean frequency per text for the words in each cluster.
round(km.freq[["centers"]], digits = 4)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 1 0.0000 0.0000 0.0000 9.0000 18.0000 14.0000 0.0000 0.0000
## 2 0.9444 2.1111 5.2222 2.0000 2.3889 3.2222 3.5556 5.2778
## 3 1.5000 9.2500 6.0000 3.0000 5.0000 6.7500 7.5000 9.5000
## 4 6.2500 13.5000 22.2500 10.2500 15.7500 12.7500 9.5000 28.2500
## 5 6.3333 20.3333 36.3333 13.6667 27.6667 31.0000 36.6667 36.6667
# Distribution of word frequencies per cluster for column 4 ("osaka1" in the
# header printed above) ...
boxplot(
  TFtop30[, 4] ~ km.freq$cluster,
  data = TFtop30,
  col = "lightblue"
)
# ... and for column 8 ("waseda").
boxplot(
  TFtop30[, 8] ~ km.freq$cluster,
  data = TFtop30,
  col = "lightblue"
)
# Reload the helper script and compare its two output modes.
source("getFreqDir.R")

# Default call: the first rows printed below are raw word counts per text.
res <- getFreqDir("univ")
head(res, n = 6L)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to 11 17 26 11 17 11 15 35
## and 7 18 38 15 26 31 22 37
## the 7 25 36 16 31 32 50 39
## university 6 5 16 18 21 15 9 22
## hiroshima 5 0 0 0 0 0 0 0
## of 5 18 35 10 26 30 38 34

# With tfidf = 1 the printed values are weighted: words that occur in every
# text show 0, while "hiroshima" (present in one text only) keeps a weight.
res <- getFreqDir("univ", tfidf = 1)
head(res, n = 6L)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to 0.00000 0 0 0 0 0 0 0
## and 0.00000 0 0 0 0 0 0 0
## the 0.00000 0 0 0 0 0 0 0
## university 0.00000 0 0 0 0 0 0 0
## hiroshima 10.39721 0 0 0 0 0 0 0
## of 0.00000 0 0 0 0 0 0 0