一行ずつ読み込んで、文字ベクトルに格納
# Read tufs.txt; readLines() yields one character string per line of the file.
txt <- readLines(con = "tufs.txt")
# Show the first line that was read.
txt[1L]
## [1] "These days, raising “global human resources” is being preached from various areas of industry, government, and academia. After entering the 21st century, the borderline between domestic and overseas markets has disappeared, people and objects move dramatically across national borders, and globalization is further progressing. With the economics, societies and cultures of various regions of the world being swallowed into globalization, in order to deal effectively with various situations, it is necessary to view these situations from above like a bird looking down from the sky. Global human resources have the ability to accurately overlook these situations in their entirety with a wide, global sense of view, and they are needed for this reason."
# Number of lines read.
length(x = txt)
## [1] 5
# Split every line at each whitespace or punctuation character. strsplit()
# returns one character vector per input line, so flatten the result into a
# single vector of words.
wordL <- unlist(strsplit(txt, "[[:space:]]|[[:punct:]]"))
# Case-fold so that capitalised and lower-case forms count as the same word.
wordL <- tolower(wordL)
# Adjacent separators (such as ", ") leave empty strings behind; a single
# pass removes them.
wordL <- wordL[nzchar(wordL)]
# Token count: the total number of running words.
tokens <- length(wordL)
tokens
## [1] 548
# Type count: the number of distinct word forms among the tokens
# (every element that is not a repeat of an earlier one).
types <- sum(!duplicated(wordL))
types
## [1] 244
\[ TTR=\frac{types}{tokens} \times 100 \]
# Type-token ratio (TTR), expressed as a percentage.
(types / tokens) * 100
## [1] 44.53
# Frequency table of the word forms, most frequent first.
freqL <- sort(x = table(wordL), decreasing = TRUE)
# The five most frequent words.
freqL[seq_len(5)]
## wordL
## of and the studies in
## 36 34 34 15 14
# Save the complete frequency list as a CSV file.
write.csv(x = freqL, file = "freq-tufs.csv")
\[ K=10000 \times \frac{(\sum m^2 \times freq(m)) -tokens}{tokens^2} \]
# Frequency of frequencies: each name of mFreq is a word frequency m, and
# each value is the number of word types that occur exactly m times.
mFreq <- table(freqL)
tokens
## [1] 548
# Inspect the third entry: its name is the frequency m ...
names(mFreq)[3]
## [1] "3"
# ... and its value is how many word types have that frequency.
mFreq[3]
## 3
## 8
# The name is a string, so convert it to a number before multiplying.
as.numeric(names(mFreq)[3]) * mFreq[3]
## 3
## 24
\[ m^2 \times freq(m) \]
# m^2 * freq(m) for the third entry: the frequency m (stored as the name) is
# squared before it is multiplied by the number of types with that frequency.
as.numeric(names(mFreq[3]))^2 * mFreq[3]
## 3
## 72
# m^2 * freq(m) for every frequency class, computed with vectorised
# arithmetic. The names of mFreq (the frequencies m) are kept as the names of
# the result; as.vector() drops the table attributes so a plain numeric
# vector is produced, also when mFreq is empty.
m2 <- setNames(as.numeric(names(mFreq))^2 * as.vector(mFreq), names(mFreq))
\[ \sum( m^2 \times freq(m)) \]
# Sum of m^2 * freq(m) over all frequency classes.
sum(m2)
## [1] 5504
# K = 10000 * (sum(m^2 * freq(m)) - tokens) / tokens^2
K <- (10000 * (sum(m2) - tokens)) / tokens^2
K
## [1] 165