テキストファイル名を引数にして、単語の出現頻度と相対頻度の行列データを出力するgetFreqMtx関数を作成しなさい。
"osaka-u.txt"を使用して、正しく実行できるか確認すること。
getFreqMtx.R
#' Build a raw and relative word-frequency table for a text file.
#'
#' The file is read line by line, split on whitespace and punctuation,
#' lower-cased, and empty tokens are discarded before counting.
#'
#' @param filename Path of the text file to analyse.
#' @return A data frame with one row per distinct word and the columns
#'   `term` (character), `raw` (number of occurrences) and `relative`
#'   (`raw` divided by the total number of tokens, rounded to 3 decimals).
#'   Rows are sorted by `raw` in decreasing order; ties keep alphabetical
#'   order. Row names are the positions of the words in alphabetical order.
getFreqMtx <- function(filename) {
  # readLines() documents "UTF-8" (not "utf8") as the encoding mark.
  txt <- readLines(filename, encoding = "UTF-8")
  wordLst <- tolower(unlist(strsplit(txt, "[[:space:]]|[[:punct:]]")))
  wordLst <- wordLst[wordLst != ""]

  # table() lists the distinct words in sorted order.
  freq <- table(wordLst)
  raw <- as.vector(freq)

  # Build the columns from plain vectors. Handing a table object to
  # data.frame() lets it expand into its own label/Freq columns, which
  # breaks the three-column layout this function promises.
  # as.character() keeps `term` a character(0) column when there are no words.
  freqMtx <- data.frame(
    term = as.character(names(freq)),
    raw = raw,
    relative = round(raw / sum(raw), 3),
    stringsAsFactors = FALSE
  )

  # order() is stable, so equally frequent words stay alphabetical.
  freqMtx[order(freqMtx$raw, decreasing = TRUE), ]
}
# Excerpt of the statement inside getFreqMtx() that converts the term column
# to character. freqMtx is only assigned inside that function, so this line
# is shown for explanation rather than meant to run on its own.
freqMtx$term <- as.character(freqMtx$term)
# Build the frequency table for the sample text osaka-u.txt.
res <- getFreqMtx("osaka-u.txt")
## term raw relative
## 261 the 41 0.060
## 23 and 33 0.048
## 183 of 30 0.044
## 281 university 28 0.041
## 194 osaka 23 0.033
## 270 to 23 0.033
## 9 a 15 0.022
## 26 as 13 0.019
## 127 in 12 0.017
## 93 for 10 0.015
引数relativeの値によって処理を分岐させる
# Switch between raw counts and relative frequencies.
# wordLst is the cleaned token vector prepared earlier in the function.
relative <- FALSE
freq <- sort(table(wordLst), decreasing = TRUE)
if (isTRUE(relative)) {
  # Convert counts to proportions of the total, rounded to 3 decimals.
  freq <- round(freq / sum(freq), 3)
}
getFreqMtx2.R
#' Build a word-frequency table for a text file, labelled by file name.
#'
#' The file is read line by line, split on whitespace and punctuation,
#' lower-cased, and empty tokens are discarded before counting.
#'
#' @param filename Path of the text file to analyse.
#' @param relative If `TRUE`, report relative frequencies (count divided by
#'   the total number of tokens, rounded to 3 decimals) instead of raw
#'   counts. Defaults to `FALSE` (raw counts).
#' @return A data frame with the columns `term` (character) and one value
#'   column named after the file (base name without its extension). Rows are
#'   sorted by value in decreasing order, ties in alphabetical order, and
#'   row names are renumbered 1..n. A file without any word yields a
#'   zero-row data frame with the same two columns.
getFreqMtx2 <- function(filename, relative = FALSE) {
  # readLines() documents "UTF-8" (not "utf8") as the encoding mark.
  txt <- readLines(filename, encoding = "UTF-8")
  wordLst <- tolower(unlist(strsplit(txt, "[[:space:]]|[[:punct:]]")))
  wordLst <- wordLst[wordLst != ""]

  # table() lists the distinct words in sorted order.
  freq <- table(wordLst)
  values <- as.vector(freq)
  if (isTRUE(relative)) {
    values <- round(values / sum(values), 3)
  }

  # Build the columns from plain vectors. Handing a table object to
  # data.frame() lets it expand into its own label/Freq columns.
  freqMtx <- data.frame(
    term = as.character(names(freq)),
    freq = values,
    stringsAsFactors = FALSE
  )

  # Sort BEFORE renaming the value column: once it carries the file label
  # there is no `freq` column left to sort on.
  freqMtx <- freqMtx[order(freqMtx$freq, decreasing = TRUE), ]

  # Column label = file name without directory and extension.
  label <- tools::file_path_sans_ext(basename(filename))
  names(freqMtx) <- c("term", label)

  # seq_len() also copes with zero rows, where 1:nrow() would give c(1, 0).
  rownames(freqMtx) <- seq_len(nrow(freqMtx))
  freqMtx
}
# Excerpt from getFreqMtx2(): renumber the row names from 1 to the number of
# rows after the table has been reordered.
rownames(freqMtx) <- seq(1:nrow(freqMtx))
# Excerpt from getFreqMtx2(): derive the value-column label from the file
# path. The path is split on "/" and "." and the second-to-last piece
# (the part just before the extension) is used.
label = ""
fpathL = unlist(strsplit(filename, "/|\\."))
label = fpathL[length(fpathL) - 1]
# NOTE(review): the getFreqMtx2() listing applies this rename to freqMtx;
# freqData is not defined in that listing -- confirm which name is intended.
names(freqData) <- c("term", label)
# Stand-alone demonstration of the label extraction used by getFreqMtx2().
f <- "../testData/test2.txt"
# Split the path on "/" and "."; the piece before the extension is the label.
fpathL <- unlist(strsplit(f, "/|\\."))
label <- fpathL[length(fpathL) - 1]
label
## [1] "test2"
# Load the getFreqMtx2() definition from its script file.
source("getFreqMtx2.R")
# Default call (relative = FALSE): the value column holds raw counts and is
# named after the input file.
getFreqMtx2("../testData/test2.txt")
## term test2
## 1 f 11
## 2 g 7
## 3 b 4
## 4 a 2
## 5 c 2
## 6 e 1
# relative = TRUE: the same table expressed as proportions of all tokens.
getFreqMtx2("../testData/test2.txt", relative = TRUE)
## term test2
## 1 f 0.407
## 2 g 0.259
## 3 b 0.148
## 4 a 0.074
## 5 c 0.074
## 6 e 0.037
# Build the raw-count table for osaka-u.txt and check its dimensions
# (rows = distinct words, columns = term + value).
res <- getFreqMtx2("osaka-u.txt")
dim(res)
## [1] 305 2
# Rank/frequency plots of the word counts held in `res`
# (x = row names of res, y = second column of res).
title <- "Word Frequency Distribution"
xlabel <- "Rank"
ylabel <- "Frequency"

# Linear axes.
plot(
  rownames(res), res[, 2],
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)

# Same data on log-log axes.
plot(
  rownames(res), res[, 2],
  xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy",
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)
# Word-length distribution over the distinct words (types) in `res`.
res$term[1]
## [1] "the"
nchar(res$term[1])
## [1] 3
# lapply() applies nchar() to every term and returns a list.
lapply(res$term, nchar)
lapply(res$term, nchar)[1:3]
# nchar() is vectorised, so the lengths can be taken in one call without
# the lapply()/unlist() round trip.
charlen <- nchar(res$term)
charlen[1:5]
## [1] 3 3 2 10 5
# Number of types for each word length.
charlenF <- table(charlen)
title <- "Word Length Frequency Distribution (Types)"
xlabel <- "Word Length"
ylabel <- "Frequency"
# The x-axis must reach the longest word. length(charlenF) only counts the
# distinct lengths and falls short whenever some length does not occur.
xmax <- max(charlen)
ymax <- max(charlenF)
plot(charlenF, type = "b", pch = 8, col = "orange",
     xlim = c(1, xmax), ylim = c(1, ymax),
     main = title, xlab = xlabel, ylab = ylabel)
\[ Frequency=\frac{K}{Rank^A} \] K,A: 定数
# Zipf's law: Frequency = K / Rank^A, with K and A constants.
# K is anchored at the highest observed word frequency so the curve starts
# where the data starts. The earlier ymax was computed from the word-length
# table, not from the word frequencies.
K <- max(res[, 2])
A <- 0.75
# `rank` has to be defined explicitly. Without it the name resolves to the
# base function rank() and K / r^A fails with "non-numeric argument to
# binary operator".
rank <- seq_len(nrow(res))
# Arithmetic is vectorised, so no lapply() is needed.
zipf <- K / rank^A

title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
# Draw the observed frequencies first, then add the theoretical curve with
# lines() so both share one coordinate system. Overlaying two plot() calls
# through par(new = TRUE) with different xlim values misaligns the axes.
plot(rank, res[, 2], xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy",
     pch = 8, col = "darkgreen", main = title, xlab = xlabel, ylab = ylabel)
lines(rank, zipf, col = "red")
legend(20, 100, c("Frequency", "Zipf's\nlaw"), lty = c(NA, 1), pch = c(8, NA),
       col = c("darkgreen", "red"))
# Interactive version: sliders control the x-axis upper limit (xmax) and the
# Zipf exponent (rangeA). K, title, xlabel and ylabel come from the
# surrounding session.
library(manipulate)
manipulate({
  # Define the ranks locally; a bare `rank` would resolve to the base
  # function rank() and break the arithmetic below.
  rank <- seq_len(nrow(res))
  plot(rank, res[, 2], xlim = c(1, xmax), ylim = c(1, 100), log = "xy",
       pch = 8, col = "darkgreen", main = title, xlab = xlabel, ylab = ylabel)
  # Add the curve to the existing plot instead of stacking a second plot()
  # via par(new = TRUE), which redraws the axes and title on top.
  lines(rank, K / rank^rangeA, col = "red")
}, xmax = slider(5, nrow(res), initial = 50), rangeA = slider(0.5, 1.5, initial = 1))
# Build one frequency table per file found in the test-data directory.
dirName <- "../testData"
files <- list.files(dirName)
files
## [1] "test1.txt" "test2.txt" "test3.txt"
# file.path() joins directory and file names with "/". Unlike paste(), it
# returns an empty vector when `files` is empty instead of the bogus path
# "../testData/".
filesDir <- file.path(dirName, files)
filesDir
## [1] "../testData/test1.txt" "../testData/test2.txt" "../testData/test3.txt"
# One data frame (term + counts) per file.
freqLst <- lapply(filesDir, getFreqMtx2)
freqLst[1]
## [[1]]
## term test1
## 1 c 13
## 2 e 7
## 3 b 4
## 4 a 3
# Outer-join all per-file tables on "term" so every file contributes one
# value column.
mtx <- Reduce(
  function(acc, nxt) merge(acc, nxt, by = "term", all = TRUE),
  freqLst
)
# A term absent from a file comes out of the outer join as NA; count it as 0.
mtx[is.na(mtx)] <- 0
# Sort the rows alphabetically by term.
mtx <- mtx[order(as.vector(mtx$term)), ]
mtx
## term test1 test2 test3
## 1 a 3 2 2
## 2 b 4 4 0
## 3 c 13 2 3
## 4 d 0 0 1
## 5 e 7 1 1
## 6 f 0 11 9
## 7 g 0 7 7
## 8 h 0 0 4
# Use the terms as row names.
rownames(mtx) <- mtx$term
mtx
## term test1 test2 test3
## a a 3 2 2
## b b 4 4 0
## c c 13 2 3
## d d 0 0 1
## e e 7 1 1
## f f 0 11 9
## g g 0 7 7
## h h 0 0 4
# The term column is now redundant with the row names; remove it.
mtx$term <- NULL
mtx
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
# Save the term-by-file matrix as CSV.
write.csv(mtx, "testMtx.csv")
# Total count of each term across all files.
rowSums(mtx)
## a b c d e f g h
## 7 8 18 1 9 20 14 4
# Keep only the terms that occur at least 10 times overall.
mtx[rowSums(mtx) >= 10, ]
## test1 test2 test3
## c 13 2 3
## f 0 11 9
## g 0 7 7
ただし、オプション引数を省略した場合（デフォルト）は、素頻度行列が結果として出力されるようにすること。