========================================================
source("getFreqMtx_v1.R")
freqMtx<-getFreqMtx1("msg-osakau.txt")
head(freqMtx)
## term raw relative
## 204 the 32 0.06438632
## 20 and 31 0.06237425
## 140 of 30 0.06036217
## 108 in 15 0.03018109
## 221 university 15 0.03018109
## 144 osaka 14 0.02816901
selectFreq1 <- function(Fmtx, relative=FALSE){
if (relative==TRUE){
res<-Fmtx[c(1,3)]
}else{
res<-Fmtx[1:2]
}
return(res)
}
selectFreq1(freqMtx)
selectFreq1(freqMtx,relative=FALSE)
selectFreq1(freqMtx,relative=TRUE)
dirName <- "testData"
files <- list.files(dirName)
files
## [1] "test1.txt" "test2.txt" "test3.txt" "test4.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"
## [4] "testData/test4.txt"
freqLst <- lapply(filesDir, getFreqMtx1)
rawfreqLst<-lapply(freqLst, selectFreq1)
files
## [1] "test1.txt" "test2.txt" "test3.txt" "test4.txt"
strsplit(files[1],"\\.")
## [[1]]
## [1] "test1" "txt"
unlist(strsplit(files[1],"\\."))[1]
## [1] "test1"
fnames<-unlist(lapply(files, function(x) unlist(strsplit(x,"\\."))[1]))
fnames
## [1] "test1" "test2" "test3" "test4"
fnames_b <-sapply(sapply(files, strsplit,"\\."),function(x)x[1])
#for(i in 1:length(rawfreqLst)) {colnames(rawfreqLst[[i]])[2]=fnames[i]}
tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result
tf
## term raw.x raw.y raw.x raw.y
## 1 a 3 2 2 4
## 2 b 4 4 NA 4
## 3 c 13 2 3 5
## 4 d NA NA 1 1
## 5 e 7 1 1 2
## 6 f NA 11 9 20
## 7 g NA 7 7 14
## 8 h NA NA 4 4
tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- fnames
tf
## test1 test2 test3 test4
## a 3 2 2 4
## b 4 4 0 4
## c 13 2 3 5
## d 0 0 1 1
## e 7 1 1 2
## f 0 11 9 20
## g 0 7 7 14
## h 0 0 4 4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
tf_b <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf_b <- full_join(tf_b, i, all = T, by = "term")
## Warning: Column `term` joining factors with different levels, coercing to
## character vector
## Warning: Column `term` joining character vector and factor, coercing into
## character vector
## Warning: Column `term` joining character vector and factor, coercing into
## character vector
tf_b[is.na(tf_b)] <- 0
row.names(tf_b) <- tf_b[, 1]
tf_b <- tf_b[-1]
colnames(tf_b) <- fnames
tf_b
## test1 test2 test3 test4
## c 13 2 3 5
## e 7 1 1 2
## b 4 4 0 4
## a 3 2 2 4
## f 0 11 9 20
## g 0 7 7 14
## h 0 0 4 4
## d 0 0 1 1
複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]
N<-ncol(tf)
df<-apply(tf, 1, function(x) length(x[x>0]) )
w<-round(tf*log(N/df),2)
w
## test1 test2 test3 test4
## a 0.00 0.00 0.00 0.00
## b 1.15 1.15 0.00 1.15
## c 0.00 0.00 0.00 0.00
## d 0.00 0.00 0.69 0.69
## e 0.00 0.00 0.00 0.00
## f 0.00 3.16 2.59 5.75
## g 0.00 2.01 2.01 4.03
## h 0.00 0.00 2.77 2.77
w[rowSums(w)>0,]
## test1 test2 test3 test4
## b 1.15 1.15 0.00 1.15
## d 0.00 0.00 0.69 0.69
## f 0.00 3.16 2.59 5.75
## g 0.00 2.01 2.01 4.03
## h 0.00 0.00 2.77 2.77
dirName <- "univ"
files <- list.files(dirName)
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
fnames<-unlist(lapply(files, function(x) unlist(strsplit(x,"\\."))[1]))
freqLst <- lapply(filesDir, getFreqMtx1)
#lapply(freqLst,head)
rawfreqLst<-lapply(freqLst, selectFreq1)
#lapply(rawfreqLst,head)
tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the result
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the result
## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the
## result
#for (i in rawfreqLst[-1]) tf_b <- full_join(tf_b, i, all = T, by = "term")
#head(tf)
tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- fnames
head(tf)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 000 0 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 0 1
## 10 0 0 1 0 0 0 0 1
## 100 0 0 0 0 1 0 0 0
## 11 2 0 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0 0
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
#source("getFreqMtx_v1.R")
res<-getFreqMtx1("msg-osakau.txt")
row.names(res) <- res[, 1]
res <- res[-1]
freqMtx <- res[1]
head(res)
## raw relative
## the 32 0.06438632
## and 31 0.06237425
## of 30 0.06036217
## in 15 0.03018109
## university 15 0.03018109
## osaka 14 0.02816901
K=freqMtx[1,1]
A=0.8
rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A
library("manipulate")
title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
A=0.8
rank <- seq(1:dim(freqMtx)[1])
manipulate(
{
zipf <- constK/rank^A
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
text(20, 50, "Frequency=K/Rank^A")
text(20, 40, paste("K=", constK))
text(20, 30, paste("A=", A))
}
, constK=slider(10,50, initial=freqMtx[1,1],step=2)
)
定数Aのスライダー 最小0, 最大1 ステップ:0.05 に設定
install.packages("shiny", dependencies = TRUE)
library(shiny)
shinyUI(bootstrapPage(
# Application title
headerPanel("Test Hoge"),
# Sidebar
sidebarPanel(
textInput("msg", "Please input your message:")
),
# Show a message
mainPanel(
textOutput("showMessage")
)
))
shinyServer(function(input, output) {
output$showMessage <- renderText({
input$msg
})
})
runApp("app_hoge")
runApp("app_hoge2")
runApp("app_hoge3")
runApp("app_zipf")
runApp("app_zipf_ans")
alt text
source("getFreqDir.R")
res <- getFreqDir("testData")
round(res,2)
## test1 test2 test3 test4
## a 3 2 2 4
## b 4 4 0 4
## c 13 2 3 5
## d 0 0 1 1
## e 7 1 1 2
## f 0 11 9 20
## g 0 7 7 14
## h 0 0 4 4
res1 <- getFreqDir("testData",relative=TRUE)
round(res1,2)
## test1 test2 test3 test4
## a 0.11 0.07 0.07 0.07
## b 0.15 0.15 0.00 0.07
## c 0.48 0.07 0.11 0.09
## d 0.00 0.00 0.04 0.02
## e 0.26 0.04 0.04 0.04
## f 0.00 0.41 0.33 0.37
## g 0.00 0.26 0.26 0.26
## h 0.00 0.00 0.15 0.07
res2 <- getFreqDir("testData",tfidf=TRUE)
round(res2,2)
## test1 test2 test3 test4
## b 1.15 1.15 0.00 1.15
## d 0.00 0.00 0.69 0.69
## f 0.00 3.16 2.59 5.75
## g 0.00 2.01 2.01 4.03
## h 0.00 0.00 2.77 2.77