Packages

library(stylo)
## 
## ### stylo version: 0.7.4 ###
## 
## If you plan to cite this software (please do!), use the following reference:
##     Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
##     a package for computational text analysis. R Journal 8(1): 107-121.
##     <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
## 
## To get full BibTeX entry, type: citation("stylo")

Load parsed Isocrates’ speeches

# Read every file in the "corpus" directory into a single corpus object.
corpus <- load.corpus(corpus.dir = "corpus", files = "all")
names(corpus) # 21 texts
##  [1] "Aegineticus.txt"        "AgainstCallimachus.txt" "AgainstEuthynus.txt"   
##  [4] "AgainstLochites.txt"    "AgainsttheSophists.txt" "Antidosis.txt"         
##  [7] "Archidamus.txt"         "Areopagiticus.txt"      "Busiris.txt"           
## [10] "Evagoras.txt"           "Helen.txt"              "Nicocles.txt"          
## [13] "OnthePeace.txt"         "Panathenaicus.txt"      "Panegyricus.txt"       
## [16] "Plataicus.txt"          "TeamofHorses.txt"       "ToDemonicus.txt"       
## [19] "ToNicocles.txt"         "ToPhilip.txt"           "Trapeziticus.txt"
# Number of elements (word tokens) in each text of the corpus.
# lengths() is vectorized, so no vector is grown inside a loop, and it
# also works for an empty corpus (where 1:length(corpus) would yield 1:0).
l <- lengths(corpus, use.names = FALSE)
sum(l) # number of words in the corpus
## [1] 111861

Draw random samples (1,000 words each) from every text in the corpus

# Draw 5 random samples of 1000 words (sampling with replacement) per text.
samples <- make.samples(
  corpus,
  sample.size = 1000,
  sampling = "random.sampling",
  sampling.with.replacement = TRUE,
  number.of.samples = 5
)
length(samples) # 21 texts * 5 samples each = 105 samples
## [1] 105

Randomly distribute the samples among three simulated author profiles

# Draw 16 sample indices for each of the three simulated authors.
# Indices are uniform over 1..length(samples) and drawn with replacement, so
# the same sample may be picked more than once. The upper bound is taken from
# the actual number of samples instead of a hard-coded constant, so it stays
# correct if the corpus or number.of.samples changes.
# NOTE(review): no seed is set, so each run yields different indices; call
# set.seed() beforehand if the experiment needs to be reproducible.
num1 <- sample.int(length(samples), size = 16, replace = TRUE)
num1
##  [1]  75  28   8  54  84  57  64  57  42  41 104  61  29  19  71  42
num2 <- sample.int(length(samples), size = 16, replace = TRUE)
num2
##  [1]  8 65 92 54 89 45 70 58 15 12 41 71 92 51 83 26
num3 <- sample.int(length(samples), size = 16, replace = TRUE)
num3
##  [1] 32 80 83 77 31 61 43 27 63 90 35 11 15  6 13 56
## Create 6 texts for 3 simulated "Isocrates" authors (2 texts per author).
## Each text is built from 8 of the author's 16 drawn samples: indices 1-8
## form the first text, indices 9-16 the second. The selected samples are
## flattened with unlist() into one long vector, i.e. a single "text".
Isoc1_Text1 <- unlist(samples[num1[1:8]], use.names = FALSE)
Isoc1_Text2 <- unlist(samples[num1[9:16]], use.names = FALSE)
Isoc2_Text1 <- unlist(samples[num2[1:8]], use.names = FALSE)
Isoc2_Text2 <- unlist(samples[num2[9:16]], use.names = FALSE)
Isoc3_Text1 <- unlist(samples[num3[1:8]], use.names = FALSE)
## Fixed: this text was previously built from Isoc1_Text1 (copy-paste slip),
## which made it an exact duplicate of the first author's first text.
Isoc3_Text2 <- unlist(samples[num3[9:16]], use.names = FALSE)

Write files

# Write each generated text to a file named after its variable, one element
# per line, without row names, column names or quoting.
profile_texts <- list(
  Isoc1_Text1 = Isoc1_Text1,
  Isoc1_Text2 = Isoc1_Text2,
  Isoc2_Text1 = Isoc2_Text1,
  Isoc2_Text2 = Isoc2_Text2,
  Isoc3_Text1 = Isoc3_Text1,
  Isoc3_Text2 = Isoc3_Text2
)
for (file_name in names(profile_texts)) {
  write.table(
    profile_texts[[file_name]],
    file_name,
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  )
}

Run the size-penalization test (size.penalize)

# Run stylo's size.penalize() on the texts stored in "corpus_new", using
# Delta classification for 35, 70 and 100 most frequent words and sample
# sizes of 500, 1000 and 1500, with 100 iterations.
# NOTE(review): the texts above are written to the working directory, not to
# "corpus_new" -- presumably they are moved there by hand; confirm.
sp <- size.penalize(
  corpus.dir = "corpus_new",
  mfw = c(35, 70, 100),
  sample.size.coverage = c(500, 1000, 1500),
  iterations = 100,
  classification.method = "delta"
)
# One accuracy matrix per tested text (rows: mfw, columns: sample size).
sp$accuracy.scores
## $Isoc1_Text1
##          500 1000 1500
## mfw_35  0.07 0.02    0
## mfw_70  0.02 0.00    0
## mfw_100 0.02 0.00    0
## 
## $Isoc1_Text2
##          500 1000 1500
## mfw_35  0.28 0.24 0.34
## mfw_70  0.30 0.23 0.23
## mfw_100 0.29 0.26 0.24
## 
## $Isoc2_Text1
##          500 1000 1500
## mfw_35  0.12 0.12 0.16
## mfw_70  0.13 0.10 0.17
## mfw_100 0.16 0.26 0.27
## 
## $Isoc2_Text2
##          500 1000 1500
## mfw_35  0.15 0.21 0.12
## mfw_70  0.20 0.28 0.20
## mfw_100 0.30 0.39 0.30
## 
## $Isoc3_Text1
##         500 1000 1500
## mfw_35    0    0    0
## mfw_70    0    0    0
## mfw_100   0    0    0
## 
## $Isoc3_Text2
##          500 1000 1500
## mfw_35  0.01 0.00    0
## mfw_70  0.00 0.01    0
## mfw_100 0.00 0.00    0
## 
## attr(,"description")
## [1] "accuracy scores for the tested texts"

Mean and standard deviation of the accuracy scores

## Collect all values from the per-text accuracy matrices into one numeric
## vector. lapply() + unlist() replaces growing a vector inside a loop and is
## safe when the list is empty (1:length(x) would iterate over 1:0).
vec <- unlist(lapply(sp$accuracy.scores, as.vector), use.names = FALSE)
mean(vec)
## [1] 0.1148148
sd(vec)
## [1] 0.1227782