Packages
library(stylo)
##
## ### stylo version: 0.7.4 ###
##
## If you plan to cite this software (please do!), use the following reference:
## Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
## a package for computational text analysis. R Journal 8(1): 107-121.
## <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
##
## To get full BibTeX entry, type: citation("stylo")
Load parsed Isocrates’ speeches
# Read every file ("all") found in the "corpus" directory into `corpus`.
corpus <- load.corpus(files = "all", corpus.dir = "corpus")
# List the names of the loaded texts (one element per file).
names(corpus) ## 21 texts
## [1] "Aegineticus.txt" "AgainstCallimachus.txt" "AgainstEuthynus.txt"
## [4] "AgainstLochites.txt" "AgainsttheSophists.txt" "Antidosis.txt"
## [7] "Archidamus.txt" "Areopagiticus.txt" "Busiris.txt"
## [10] "Evagoras.txt" "Helen.txt" "Nicocles.txt"
## [13] "OnthePeace.txt" "Panathenaicus.txt" "Panegyricus.txt"
## [16] "Plataicus.txt" "TeamofHorses.txt" "ToDemonicus.txt"
## [19] "ToNicocles.txt" "ToPhilip.txt" "Trapeziticus.txt"
# Length of every text in the corpus, one entry per text.
# vapply() replaces the original index loop: it does not grow a vector one
# element at a time, and it returns integer(0) for an empty corpus instead of
# indexing out of bounds through `1:length(corpus)`.
l <- vapply(corpus, length, integer(1), USE.NAMES = FALSE)
sum(l) ## number of words in the corpus
## [1] 111861
Make random samples (1000 words each) from the whole corpus
# Draw 5 random samples of 1000 words from each text, sampling with
# replacement.
samples <- make.samples(
  corpus,
  sample.size = 1000,
  sampling = "random.sampling",
  sampling.with.replacement = TRUE,
  number.of.samples = 5
)
length(samples) ## 21 * 5 = 105 samples
## [1] 105
Randomly distribute the samples among the three profiles
# Draw 16 sample indices per profile, uniformly from 1 to length(samples)
# (105 in the run recorded below), with replacement.
# The upper bound is taken from `samples` instead of being hard-coded as 106,
# so the indices stay in range if the number of samples changes.
# NOTE(review): no set.seed() is called, so every run draws different
# indices; the printed values below come from one particular run. Duplicate
# indices within a profile are possible, as they were with the original
# floor(runif(...)) draw.
num1 <- sample.int(length(samples), size = 16, replace = TRUE)
num1
## [1] 75 28 8 54 84 57 64 57 42 41 104 61 29 19 71 42
num2 <- sample.int(length(samples), size = 16, replace = TRUE)
num2
## [1] 8 65 92 54 89 45 70 58 15 12 41 71 92 51 83 26
num3 <- sample.int(length(samples), size = 16, replace = TRUE)
num3
## [1] 32 80 83 77 31 61 43 27 63 90 35 11 15 6 13 56
## Build 6 texts for 3 simulated "Isocrates" profiles: each profile gets two
## texts, taken from the first 8 and the last 8 of its 16 sample indices.
Isoc1_Text1 <- samples[num1[seq_len(8)]]
Isoc1_Text2 <- samples[num1[seq(9, 16)]]
Isoc2_Text1 <- samples[num2[seq_len(8)]]
Isoc2_Text2 <- samples[num2[seq(9, 16)]]
Isoc3_Text1 <- samples[num3[seq_len(8)]]
Isoc3_Text2 <- samples[num3[seq(9, 16)]]
## Merge the 8 list elements of each text into a single unnamed vector
## (one "text" per profile slot).
Isoc1_Text1 <- unlist(Isoc1_Text1, recursive = TRUE, use.names = FALSE)
Isoc1_Text2 <- unlist(Isoc1_Text2, recursive = TRUE, use.names = FALSE)
Isoc2_Text1 <- unlist(Isoc2_Text1, recursive = TRUE, use.names = FALSE)
Isoc2_Text2 <- unlist(Isoc2_Text2, recursive = TRUE, use.names = FALSE)
Isoc3_Text1 <- unlist(Isoc3_Text1, recursive = TRUE, use.names = FALSE)
## Fixed: this line previously flattened Isoc1_Text1, which made Isoc3_Text2
## an exact copy of Isoc1_Text1 and discarded the samples selected for it.
Isoc3_Text2 <- unlist(Isoc3_Text2, recursive = TRUE, use.names = FALSE)
Write files
# Write each text to a file of the same name in the working directory,
# one element per line, with no header, row names or quoting.
invisible(Map(
  function(words, path) {
    write.table(words, path, row.names = FALSE, col.names = FALSE, quote = FALSE)
  },
  list(
    Isoc1_Text1, Isoc1_Text2,
    Isoc2_Text1, Isoc2_Text2,
    Isoc3_Text1, Isoc3_Text2
  ),
  c(
    "Isoc1_Text1", "Isoc1_Text2",
    "Isoc2_Text1", "Isoc2_Text2",
    "Isoc3_Text1", "Isoc3_Text2"
  )
))
Penalize
# Run stylo's size.penalize() on the texts in "corpus_new" with the "delta"
# classification method, for mfw values 35/70/100 and sample sizes
# 500/1000/1500, using 100 iterations.
# NOTE(review): the texts written earlier go to the working directory, while
# this call reads from "corpus_new" - confirm the files are placed there
# before running.
sp <- size.penalize(
  mfw = c(35, 70, 100),
  corpus.dir = "corpus_new",
  sample.size.coverage = c(500, 1000, 1500),
  iterations = 100,
  classification.method = "delta"
)
# Print the accuracy scores: one mfw-by-sample-size table per tested text.
sp$accuracy.scores
## $Isoc1_Text1
## 500 1000 1500
## mfw_35 0.07 0.02 0
## mfw_70 0.02 0.00 0
## mfw_100 0.02 0.00 0
##
## $Isoc1_Text2
## 500 1000 1500
## mfw_35 0.28 0.24 0.34
## mfw_70 0.30 0.23 0.23
## mfw_100 0.29 0.26 0.24
##
## $Isoc2_Text1
## 500 1000 1500
## mfw_35 0.12 0.12 0.16
## mfw_70 0.13 0.10 0.17
## mfw_100 0.16 0.26 0.27
##
## $Isoc2_Text2
## 500 1000 1500
## mfw_35 0.15 0.21 0.12
## mfw_70 0.20 0.28 0.20
## mfw_100 0.30 0.39 0.30
##
## $Isoc3_Text1
## 500 1000 1500
## mfw_35 0 0 0
## mfw_70 0 0 0
## mfw_100 0 0 0
##
## $Isoc3_Text2
## 500 1000 1500
## mfw_35 0.01 0.00 0
## mfw_70 0.00 0.01 0
## mfw_100 0.00 0.00 0
##
## attr(,"description")
## [1] "accuracy scores for the tested texts"
Mean & SD
## Pool every value from the per-text accuracy matrices into one unnamed
## numeric vector, then summarise it.
## unlist()/lapply() replaces the original index loop, which grew `vec` on
## every pass and iterated over `1:length(...)` (unsafe for an empty list).
vec <- unlist(lapply(sp$accuracy.scores, as.vector), use.names = FALSE)
mean(vec)
## [1] 0.1148148
sd(vec)
## [1] 0.1227782