# Show the working directory so the relative paths used below can be checked.
getwd()
## [1] "/cloud/project"

# Load the helper script; getFreq2() is presumably defined there -- TODO confirm.
source(file = "utils2.R")
freqData <- getFreq2("GSLC2021.txt")
head(x = freqData)

# Take the first ten row names of the frequency table as sample input.
tmp <- rownames(freqData)[seq_len(10)]

# Append a tag to every element in three ways:
#   paste()  - vectorised call, returns an unnamed character vector
#   lapply() - one call per element, returns a list
#   sapply() - one call per element, simplified to a named character vector
paste(tmp, "@GSLC2021")
lapply(X = tmp, FUN = paste, "@GSLC2021")
sapply(X = tmp, FUN = paste, "@GSLC2021")
## the and of
## "the @GSLC2021" "and @GSLC2021" "of @GSLC2021"
## in to language
## "in @GSLC2021" "to @GSLC2021" "language @GSLC2021"
## research culture by
## "research @GSLC2021" "culture @GSLC2021" "by @GSLC2021"
## with
## "with @GSLC2021"
# Add each word's share of the total count as a second column.
# prop.table() on a plain vector is x / sum(x).
tmpMtx <- cbind(freqData, prop.table(freqData$Freq))
colnames(tmpMtx) <- c("Freq", "RelativFreq")
head(x = tmpMtx)

# MARGIN = 1: apply sum() to every row (Freq + RelativFreq per word).
res <- apply(X = tmpMtx, MARGIN = 1, FUN = sum)
head(x = res)
## the and of in to language
## 37.06401 31.05363 28.04844 20.03460 16.02768 15.02595

# MARGIN = 2: apply sum() to every column.
apply(X = tmpMtx, MARGIN = 2, FUN = sum)
## Freq RelativFreq
## 578 1

# MARGIN = c(1, 2): apply the function to every single cell.
res <- apply(X = tmpMtx, MARGIN = c(1, 2), FUN = function(cell) cell * 10)
head(x = res)
## Freq RelativFreq
## the 370 0.6401384
## and 310 0.5363322
## of 280 0.4844291
## in 200 0.3460208
## to 160 0.2768166
## language 150 0.2595156
# Load the MeCab shared library by hand before attaching RMeCab.
# NOTE(review): the path is specific to this RStudio Cloud user -- adjust it
# when running elsewhere.
dyn.load(x = "/home/rstudio-user/usr/local/lib/libmecab.so.2")
library("RMeCab")

# Morphological analysis of one sentence: the result is a list with one
# element per token, each named by its part of speech.
RMeCabC("すもももももももものうち")
## [[1]]
## 名詞
## "すもも"
##
## [[2]]
## 助詞
## "も"
##
## [[3]]
## 名詞
## "もも"
##
## [[4]]
## 助詞
## "も"
##
## [[5]]
## 名詞
## "もも"
##
## [[6]]
## 助詞
## "の"
##
## [[7]]
## 名詞
## "うち"
# Token frequency table for a single Japanese text.
freqGSLC <- RMeCabFreq("Lec07_texts/ja/GSLC.txt")
## file = Lec07_texts/ja/GSLC.txt
## length = 275
head(x = freqGSLC)

# Re-order the rows so the most frequent tokens come first.
freqGSLC <- freqGSLC[order(freqGSLC$Freq, decreasing = TRUE), ]
head(x = freqGSLC)

# Draw a word cloud from columns 1 and 4
# (presumably the term and its frequency -- TODO confirm against head() above).
library("wordcloud2")
wordcloud2(freqGSLC[, c(1, 4)])
# RMeCabFunctions reference: http://rmecab.jp/wiki/index.php?RMeCabFunctions#v66bb233
# Term-document matrix over every file in the directory, restricted to
# nouns (名詞) and particles (助詞).
res <- docMatrix("Lec07_texts/ja", pos = c("名詞", "助詞"))
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
## Term Document Matrix includes 2 information rows!
## whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) != "[[LESS-THAN-1]]" , ]
## result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]

# Remove both information rows in one step so only real terms remain.
res <- res[!(rownames(res) %in% c("[[LESS-THAN-1]]", "[[TOTAL-TOKENS]]")), ]
dim(res)
## [1] 416 3

# Sort terms by their total count across all documents, largest first.
res <- res[order(rowSums(res), decreasing = TRUE), ]
head(x = res)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21 46 51
## を 20 27 40
## に 21 18 24
## 言語 9 15 31
## は 10 17 26
## 文化 23 13 13
# Same matrix with verbs (動詞) added and a minimum frequency of 5; the
# placeholder row is therefore reported as [[LESS-THAN-5]].
res <- docMatrix(
  "Lec07_texts/ja",
  pos = c("名詞", "助詞", "動詞"),
  minFreq = 5
)
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
## Term Document Matrix includes 2 information rows!
## whose names are [[LESS-THAN-5]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) != "[[LESS-THAN-5]]" , ]
## result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]

# Remove both information rows in one step, then sort by total count.
res <- res[!(rownames(res) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ]
res <- res[order(rowSums(res), decreasing = TRUE), ]
head(x = res)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21 46 51
## を 20 27 40
## する 15 18 35
## に 21 18 24
## 言語 9 15 31
## は 10 17 26
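# tf-idf weighting, basic form (tf: term count in the document, N: number of
# documents, df: number of documents containing the term):
# \[ w = tf \cdot \log\left(\frac{N}{df}\right) \]
# Variant with 1 added to the idf factor, so a term that occurs in every
# document keeps w = tf. The logarithm is base 2, as the weight = "tf*idf"
# output below shows: 23 * (log2(3 / 1) + 1) = 59.45 for "日本語".
# \[ w = tf \cdot \left(\log_2\left(\frac{N}{df}\right) + 1\right) \]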
# Same selection as before, but with tf*idf weighting applied to the counts.
res2 <- docMatrix(
  "Lec07_texts/ja",
  pos = c("名詞", "助詞", "動詞"),
  minFreq = 5,
  weight = "tf*idf"
)
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt

# Remove the information rows (if present) and sort by total weight.
res2 <- res2[!(rownames(res2) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ]
res2 <- res2[order(rowSums(res2), decreasing = TRUE), ]
head(x = res2)
## docs
## terms GSJLC.txt GSLC.txt GSLS.txt
## の 21.00000 46.00000 51.00000
## を 20.00000 27.00000 40.00000
## する 15.00000 18.00000 35.00000
## に 21.00000 18.00000 24.00000
## 日本語 59.45414 0.00000 0.00000
## 言語 9.00000 15.00000 31.00000

# Compare raw count and weighted value for a term that occurs in one document
# only: the weighting boosts it from 23 to about 59.45.
print(res[rownames(res) %in% "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 23 0 0
print(res2[rownames(res2) %in% "日本語", ])
## GSJLC.txt GSLC.txt GSLS.txt
## 59.45414 0.00000 0.00000
# Term-document matrix for the English texts, built with docMatrix()'s
# default arguments (no minFreq given).
enMtx <- docMatrix("Lec07_texts/en")
## file = Lec07_texts/en/GSLC2021.txt
## file = Lec07_texts/en/sample_en.txt
## Term Document Matrix includes 2 information rows!
## whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) != "[[LESS-THAN-1]]" , ]
## result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]

# Remove the two information rows. With the default minFreq the placeholder
# row is named "[[LESS-THAN-1]]" (see the message above); filtering on
# "[[LESS-THAN-5]]" matched nothing and left that row in the matrix.
# drop = FALSE keeps a matrix even if a single term remains, which the
# rowSums() call below requires.
enMtx <- enMtx[
  !(rownames(enMtx) %in% c("[[LESS-THAN-1]]", "[[TOTAL-TOKENS]]")),
  ,
  drop = FALSE
]

# Sort terms by their total count across both documents, largest first.
enMtx <- enMtx[order(rowSums(enMtx), decreasing = TRUE), , drop = FALSE]
head(enMtx)
## docs
## terms GSLC2021.txt sample_en.txt
## and 31 3
## the 30 1
## , 28 2
## of 28 0
## . 16 5
## in 17 1