# Gather the names of the files found in the "nhk_news" directory.
fLst <- list.files(path = "nhk_news")
# Echo the collected names.
print(fLst)
[1] "nhk_news_1.txt" "nhk_news_2.txt"
[3] "nhk_news_3.txt"
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
# Build the word-frequency table from "GSLC2021.txt" with getFreq()
# (getFreq() is defined outside this section).
freqData <- getFreq("GSLC2021.txt")
# Dimensions of the table: number of rows, then number of columns.
dim(freqData)
[1] 271 1
# Preview the first rows.
head(freqData)
# Sum of the Freq column.
sum(freqData$Freq)
[1] 578
# Largest and smallest values in the Freq column.
max(freqData$Freq)
[1] 37
min(freqData$Freq)
[1] 1
# Mean & Median
mean(freqData$Freq)
[1] 2.132841
median(freqData$Freq)
[1] 1
# Variance & Standard Deviation
var(freqData$Freq)
[1] 15.3971
sd(freqData$Freq)
[1] 3.923914
# Constants of the Zipf formula Frequency = K / Rank^A.
# K is taken from the first row of the frequency table, A is fixed at 0.8.
K <- freqData[1, 1]
A <- 0.8
# One rank per row of the table. seq_len() returns an empty vector for an
# empty table, whereas seq(1:n) would produce 1:2 when n is 0.
rank <- seq_len(nrow(freqData))
# Frequency predicted by the formula for every rank.
zipf <- K / rank^A
配置:“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center”
ラベル
lty: 線の種類
pch: プロットの種類
# Theoretical values: draw the Zipf curve as a red line on log-log axes.
plot(zipf, log = "xy", type = "l", col = "red",
     xlim = c(1, nrow(freqData)), ylim = c(1, 50),
     main = "Zipf's Law", xlab = "Rank", ylab = "Frequency")
# Overlay the observed frequencies as a scatter on the same axes.
# points() adds to the existing plot, so the axes, title and labels are not
# drawn a second time on top of themselves (as par(new = ...) + plot() did).
points(rank, freqData[, 1], pch = 8, col = "darkgreen")
# Legend: symbol only for the observed data, line only for the curve.
legend("topright", c("Frequency", "Zipf's law"), lty = c(NA, 1), pch = c(8, NA), col = c("darkgreen", "red"))
# Zipf curve (red line) on log-log axes.
plot(zipf, log = "xy", type = "l", col = "red",
     xlim = c(1, nrow(freqData)), ylim = c(1, 50),
     main = "Zipf's Law", xlab = "Rank", ylab = "Frequency")
# Add the observed frequencies to the same axes. points() draws into the
# existing plot, so the axes, title and labels are not drawn twice.
points(rank, freqData[, 1], pch = 8, col = "darkgreen")
# Legend: symbol only for the observed data, line only for the curve.
legend("topright", c("Frequency", "Zipf's law"), lty = c(NA, 1), pch = c(8, NA), col = c("darkgreen", "red"))
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
# Row names of the first ten entries of the frequency table.
tmp <- rownames(freqData)[seq_len(10)]
# paste() is vectorised, so a single call tags every element.
paste(tmp, "@GSLC2021", sep = " ")
[1] "the @GSLC2021" "and @GSLC2021"
[3] "of @GSLC2021" "in @GSLC2021"
[5] "to @GSLC2021" "language @GSLC2021"
[7] "research @GSLC2021" "culture @GSLC2021"
[9] "by @GSLC2021" "with @GSLC2021"
# Apply the tagging to each element separately; lapply() returns a list with
# one entry per element of tmp.
lapply(tmp, function(word) paste(word, "@GSLC2021"))
[[1]]
[1] "the @GSLC2021"
[[2]]
[1] "and @GSLC2021"
[[3]]
[1] "of @GSLC2021"
[[4]]
[1] "in @GSLC2021"
[[5]]
[1] "to @GSLC2021"
[[6]]
[1] "language @GSLC2021"
[[7]]
[1] "research @GSLC2021"
[[8]]
[1] "culture @GSLC2021"
[[9]]
[1] "by @GSLC2021"
[[10]]
[1] "with @GSLC2021"
# Same tagging through sapply(), which simplifies the result to a character
# vector named after the elements of tmp.
sapply(tmp, function(word) paste(word, "@GSLC2021"))
the and
"the @GSLC2021" "and @GSLC2021"
of in
"of @GSLC2021" "in @GSLC2021"
to language
"to @GSLC2021" "language @GSLC2021"
research culture
"research @GSLC2021" "culture @GSLC2021"
by with
"by @GSLC2021" "with @GSLC2021"
# Append a second column holding each frequency divided by the column total.
tmpMtx <- cbind(freqData, freqData$Freq / sum(freqData$Freq))
# Label the two columns.
colnames(tmpMtx) <- c("Freq", "RelativFreq")
head(tmpMtx)
# MARGIN = 1 walks the rows: each result is the sum of that row's two cells.
res <- apply(tmpMtx, MARGIN = 1, FUN = sum)
head(res)
the and of in to
37.06401 31.05363 28.04844 20.03460 16.02768
language
15.02595
# MARGIN = 2 walks the columns: one total per column.
apply(tmpMtx, MARGIN = 2, FUN = sum)
Freq RelativFreq
578 1
# With both margins given, the function is called once per cell; here every
# cell is multiplied by 10.
res <- apply(tmpMtx, MARGIN = c(1, 2), FUN = function(cell) cell * 10)
head(res)
Freq RelativFreq
the 370 0.6401384
and 310 0.5363322
of 280 0.4844291
in 200 0.3460208
to 160 0.2768166
language 150 0.2595156
# Read the home directory straight from the HOME environment variable;
# this avoids spawning a shell just to echo the value.
path_home <- Sys.getenv("HOME")
# Path of the MeCab shared library below the home directory.
lib_path <- paste0(path_home, "/usr/local/lib/libmecab.so.2")
# Load the shared library before attaching RMeCab.
dyn.load(lib_path)
library(RMeCab)
# Frequency table for the first news file (it has a Freq column, which the
# sorting step further down relies on).
freqNews1 <- RMeCabFreq("nhk_news/nhk_news_1.txt")
file = nhk_news/nhk_news_1.txt
length = 90
# Reorder the rows of freqNews1 by its Freq column, largest value first.
freqNews1 <-freqNews1[order(freqNews1$Freq, decreasing = TRUE),]
head(freqNews1)
# Term-document matrix over the files in "nhk_news", restricted to the parts
# of speech 名詞 (noun) and 助詞 (particle).
res1 <- docMatrix("nhk_news", pos = c("名詞","助詞"))
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
Term Document Matrix includes 2 information rows!
whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
if you remove these rows, run
result[ rownames(result) != "[[LESS-THAN-1]]" , ]
result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]
# First rows of the matrix; the two information rows named in the message
# above are still part of it.
head(res1)
docs
terms nhk_news_1.txt nhk_news_2.txt
. 0 1
[[LESS-THAN-1]] 0 0
[[TOTAL-TOKENS]] 154 192
% 0 1
1 1 0
2 0 0
docs
terms nhk_news_3.txt
. 0
[[LESS-THAN-1]] 0
[[TOTAL-TOKENS]] 148
% 0
1 2
2 1
# Same directory, now also keeping 動詞 (verb) and passing minFreq = 5; the
# recorded message names the resulting information row [[LESS-THAN-5]].
res2 <- docMatrix("nhk_news", pos = c("名詞","助詞","動詞") , minFreq=5)
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
Term Document Matrix includes 2 information rows!
whose names are [[LESS-THAN-5]] and [[TOTAL-TOKENS]]
if you remove these rows, run
result[ rownames(result) != "[[LESS-THAN-5]]" , ]
result[ rownames(result) != "[[TOTAL-TOKENS]]" , ]
# First rows of the filtered matrix.
head(res2)
docs
terms nhk_news_1.txt nhk_news_2.txt
[[LESS-THAN-5]] 95 98
[[TOTAL-TOKENS]] 154 192
アメリカ 0 0
いる 0 7
が 0 11
カブトムシ 5 0
docs
terms nhk_news_3.txt
[[LESS-THAN-5]] 78
[[TOTAL-TOKENS]] 148
アメリカ 6
いる 0
が 8
カブトムシ 0
\[w = tf \times \log\left(\frac{N}{df}\right) \]
\[w = tf \times \left(\log_{2}\left(\frac{N}{df}\right) + 1\right) \]
# Same parts of speech and minFreq as the res2 call, with weight = "tf*idf";
# the recorded output of head(res3) shows non-integer weights in the cells.
res3 <- docMatrix("nhk_news", pos = c("名詞","助詞","動詞") , minFreq=5, weight = "tf*idf")
file = nhk_news/nhk_news_1.txt
file = nhk_news/nhk_news_2.txt
file = nhk_news/nhk_news_3.txt
# First rows of the weighted matrix.
head(res3)
docs
terms nhk_news_1.txt nhk_news_2.txt
アメリカ 0.000000 0.000000
いる 0.000000 18.094738
が 0.000000 17.434588
カブトムシ 12.924813 0.000000
する 9.509775 9.509775
ツイッター 0.000000 0.000000
docs
terms nhk_news_3.txt
アメリカ 15.509775
いる 0.000000
が 12.679700
カブトムシ 0.000000
する 0.000000
ツイッター 15.509775