# Load the MeCab shared library explicitly, then attach RMeCab.
# NOTE(review): the absolute path below is specific to one machine/user
# account; presumably it is needed because libmecab is not on the default
# loader path there -- confirm and adjust when running elsewhere.
dyn.load("/home/rstudio-user/usr/local/lib/libmecab.so.2")
library(RMeCab)
\[ w = tf \times \log\left(\frac{N}{df}\right) \]
\[ w = tf \times \left(\log\left(\frac{N}{df}\right) + 1\right) \]
# Term-document matrix for the texts under Lec07_texts/ja, restricted to
# nouns, particles and verbs, with minFreq = 5 and RMeCab's built-in
# "tf*idf" weighting.
res2 <- docMatrix(
  "Lec07_texts/ja",
  pos = c("名詞", "助詞", "動詞"),
  minFreq = 5,
  weight = "tf*idf"
)
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
# Remove the two information rows in a single pass. drop = FALSE keeps the
# result a matrix even when only one term is left, so rowSums() below
# cannot fail on a collapsed vector.
res2 <- res2[
  !(rownames(res2) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ,
  drop = FALSE
]
# Order terms by their total weight across documents, largest first.
res2 <- res2[order(rowSums(res2), decreasing = TRUE), , drop = FALSE]
head(res2)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21.00000 46.00000 51.00000
## を 20.00000 27.00000 40.00000
## する 15.00000 18.00000 35.00000
## に 21.00000 18.00000 24.00000
## 日本語 59.45414 0.00000 0.00000
## 言語 9.00000 15.00000 31.00000
dim(res2)
## [1] 38 3
# Raw term-frequency matrix for the same texts and parts of speech
# (nouns, particles, verbs; minFreq = 5), without any weighting.
tf <- docMatrix(
  "Lec07_texts/ja",
  pos = c("名詞", "助詞", "動詞"),
  minFreq = 5
)
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
## Term Document Matrix includes 2 information rows!
## whose names are [[LESS-THAN-5]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) != "[[LESS-THAN-5]]" , ]
## result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]
# Remove both information rows at once. drop = FALSE keeps tf a matrix even
# when a single term remains, so rowSums() and later matrix code still work.
tf <- tf[
  !(rownames(tf) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ,
  drop = FALSE
]
# Order terms by total frequency across documents, most frequent first.
tf <- tf[order(rowSums(tf), decreasing = TRUE), , drop = FALSE]
head(tf)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21 46 51
## を 20 27 40
## する 15 18 35
## に 21 18 24
## 言語 9 15 31
## は 10 17 26
dim(tf)
## [1] 38 3
# N: number of documents, i.e. the number of columns of tf.
# The outer parentheses print the assigned value.
(N <- ncol(tf))
## [1] 3
# Document frequency: for each term (row), the number of documents in which
# it occurs at least once. rowSums() on the logical matrix replaces the
# per-row apply() and keeps the term names.
DocFreq <- rowSums(tf > 0)
head(DocFreq, 20)
## の を する に 言語 は 文化 と 研究 て が
## 3 3 3 3 3 3 3 3 3 3 3
## 専攻 で 日本語 年 日本 いる れる 教育 社会
## 3 3 1 2 1 2 3 2 1
# Variant 1: w = tf * log(N / df), natural logarithm.
# DocFreq has one entry per row of tf; recycling runs down the columns, so
# every row is multiplied by its own idf value.
tf_idf1 <- tf * log(N / DocFreq)
head(tf_idf1)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 0 0 0
## を 0 0 0
## する 0 0 0
## に 0 0 0
## 言語 0 0 0
## は 0 0 0
# Terms occurring in every document get log(N / N) = 0 in all columns.
# Keep only rows with a positive total; drop = FALSE preserves the matrix
# shape even if a single row survives.
tf_idf1 <- tf_idf1[rowSums(tf_idf1) > 0, , drop = FALSE]
head(tf_idf1)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## 日本語 25.268083 0.000000 0.000000
## 年 5.676512 2.027326 0.000000
## 日本 20.873633 0.000000 0.000000
## いる 2.027326 0.000000 4.865581
## 教育 2.432791 3.649186 0.000000
## 社会 0.000000 0.000000 15.380572
# Variant 2: w = tf * (log(N / df) + 1), natural logarithm. The "+ 1" leaves
# terms found in every document at their raw frequency instead of zero.
tf_idf2 <- tf * (log(N / DocFreq) + 1)
head(tf_idf2)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21 46 51
## を 20 27 40
## する 15 18 35
## に 21 18 24
## 言語 9 15 31
## は 10 17 26
# Compare the weight of one term across the different computations.
print(res2[rownames(res2) == "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 59.45414 0.00000 0.00000
print(tf_idf1[rownames(tf_idf1) == "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 25.26808 0.00000 0.00000
print(tf_idf2[rownames(tf_idf2) == "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 48.26808 0.00000 0.00000
# Variant 2 with a base-2 logarithm. For this term it gives the same value
# as res2 above (59.45414), unlike the natural-log variants.
tf_idf2bis <- tf * (log2(N / DocFreq) + 1)
print(tf_idf2bis[rownames(tf_idf2bis) == "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 59.45414 0.00000 0.00000
# Run getFreqDir.R, which is expected to define getFreqDir(); the recorded
# messages below show that dplyr gets attached while it is sourced.
source("getFreqDir.R")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Table built from "univ" with default settings.
# NOTE(review): presumably "univ" is a directory of text files and the
# result is a term-frequency table -- confirm against getFreqDir.R.
univTable <- getFreqDir("univ")
dim(univTable)
## [1] 1506 10
head(univTable)
# Same call with tfidf=1. The recorded result has fewer rows than the
# default call (1487 vs 1506).
# NOTE(review): presumably tfidf=1 selects a tf-idf weighting variant;
# verify the meaning of the argument in getFreqDir.R.
univTable1 <- getFreqDir("univ", tfidf=1)
dim(univTable1)
## [1] 1487 10
head(univTable1)
# Attach shiny and launch the application given by "app_pch".
# NOTE(review): "app_pch" is a relative path, presumably an app directory
# resolved against the current working directory -- confirm before running.
library(shiny)
runApp("app_pch")