Building blocks to create atomic character vectors

Moby Dick, Herman Melville: moby.words.v

# Build moby.words.v: the lower-cased word tokens of Moby Dick.
# Read the Project Gutenberg plain-text file, splitting the input on newlines.
text.v <- scan(
  "http://www.gutenberg.org/files/2701/2701-0.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)
# Indices (within text.v) of the first and last line of the novel itself;
# everything outside this range is kept separately as boilerplate.
start.v <- 537L
end.v <- 18927L
# seq_len(start.v - 1L) selects elements 1 .. start.v - 1. Writing
# `1:start.v - 1` instead would parse as (1:start.v) - 1, i.e. 0:(start.v - 1),
# which only gives the same result because R silently drops index 0.
start.boilerplate.v <- text.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text.v[(end.v + 1L):length(text.v)]
novel.lines.v <- text.v[start.v:end.v]
# Collapse the lines into one string, lower-case it, and split on every
# non-word character.
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)
moby.words.l <- strsplit(novel.lower.v, "\\W")
moby.words.v <- unlist(moby.words.l)
# Adjacent separators leave empty strings behind; drop them.
moby.words.v <- moby.words.v[moby.words.v != ""]
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(moby.words.v)
}

Around the World in 80 Days, Jules Verne: eighty.words.v

# Build eighty.words.v: the lower-cased word tokens of Around the World in
# 80 Days.
# Read the Project Gutenberg plain-text file, splitting the input on newlines.
text1.v <- scan(
  "http://www.gutenberg.org/cache/epub/103/pg103.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(text1.v)
}
# Indices (within text1.v) of the first and last line of the novel itself;
# everything outside this range is kept separately as boilerplate.
start.v <- 77L
end.v <- 6145L
# seq_len(start.v - 1L) selects elements 1 .. start.v - 1. Writing
# `1:start.v - 1` instead would parse as (1:start.v) - 1, i.e. 0:(start.v - 1),
# which only gives the same result because R silently drops index 0.
start.boilerplate.v <- text1.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text1.v[(end.v + 1L):length(text1.v)]
novel.lines.v <- text1.v[start.v:end.v]
# Collapse the lines into one string, lower-case it, and split on every
# non-word character.
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)
eighty.words.l <- strsplit(novel.lower.v, "\\W")
eighty.words.v <- unlist(eighty.words.l)
# Adjacent separators leave empty strings behind; drop them.
eighty.words.v <- eighty.words.v[eighty.words.v != ""]
if (interactive()) {
  View(eighty.words.v)
}

Peter Pan, J. M. Barrie: peter.words.v

# Build peter.words.v: the lower-cased word tokens of Peter Pan.
# Read the Project Gutenberg plain-text file, splitting the input on newlines.
text2.v <- scan(
  "http://www.gutenberg.org/files/16/16-0.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(text2.v)
}
# Indices (within text2.v) of the first and last line of the novel itself;
# everything outside this range is kept separately as boilerplate.
start.v <- 39L
end.v <- 4480L
# seq_len(start.v - 1L) selects elements 1 .. start.v - 1. Writing
# `1:start.v - 1` instead would parse as (1:start.v) - 1, i.e. 0:(start.v - 1),
# which only gives the same result because R silently drops index 0.
start.boilerplate.v <- text2.v[seq_len(start.v - 1L)]
# The trailing boilerplate must be bounded by the length of text2.v (this
# text), not by the length of another text's line vector.
end.boilerplate.v <- text2.v[(end.v + 1L):length(text2.v)]
novel.lines.v <- text2.v[start.v:end.v]
# Collapse the lines into one string, lower-case it, and split on every
# non-word character.
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)
peter.words.l <- strsplit(novel.lower.v, "\\W")
# Flatten and filter the Peter Pan tokens themselves (peter.words.l /
# peter.words.v); using the eighty.* objects here would make peter.words.v a
# copy of the Eighty Days tokens.
peter.words.v <- unlist(peter.words.l)
# Adjacent separators leave empty strings behind; drop them.
peter.words.v <- peter.words.v[peter.words.v != ""]
if (interactive()) {
  View(peter.words.v)
}
  1. Using the same code as above, compare the following for the three texts:

Number of tokens

# Number of tokens: the total count of word tokens in each text.
# The `## [1] ...` lines are recorded output from an earlier run.
# NOTE(review): the recorded counts for peter.words.v and eighty.words.v are
# identical, which suggests both vectors held the same tokens when this was
# run -- re-run and confirm before relying on these figures.
length(moby.words.v)
## [1] 214942
length(peter.words.v)
## [1] 63760
length(eighty.words.v)
## [1] 63760

Number of types

# Number of types: the count of distinct word forms in each text.
# The `## [1] ...` lines are recorded output from an earlier run.
# NOTE(review): the recorded counts for peter.words.v and eighty.words.v are
# identical, which suggests both vectors held the same tokens when this was
# run -- re-run and confirm before relying on these figures.
length(unique(moby.words.v))
## [1] 17058
length(unique(peter.words.v))
## [1] 6823
length(unique(eighty.words.v))
## [1] 6823

Simple TTR

# Simple type-token ratio as a percentage: number of types * 100 / number of
# tokens.
#   words.v: character vector of word tokens.
#   Returns a single numeric value (NaN for an empty vector, since 0 / 0).
simple.ttr <- function(words.v) {
  length(unique(words.v)) * 100 / length(words.v)
}

TTR.eighty <- simple.ttr(eighty.words.v)
TTR.moby <- simple.ttr(moby.words.v)
TTR.peter <- simple.ttr(peter.words.v)
# Misspelled legacy name, kept so any existing reference to it keeps working.
TRR.peter <- TTR.peter
# Named vector so that barplot() labels each bar with the text's title.
TTR <- c(
  "Eighty Days" = TTR.eighty,
  "Moby Dick" = TTR.moby,
  "Peter" = TTR.peter
)
barplot(TTR, main = "Simple TTR")

# Print the named TTR vector. The `##` lines are recorded output from an
# earlier run.
# NOTE(review): the recorded values for "Eighty Days" and "Peter" are
# identical, which suggests both were computed from the same tokens when this
# was run -- re-run and confirm before relying on these figures.
TTR
## Eighty Days   Moby Dick       Peter 
##   10.701066    7.936094   10.701066

Write code to sort all types by frequency

Moby Dick:

# Frequency table for Moby Dick: one entry per type holding its token count,
# in the order table() produces.
moby.freqs.t <- table(moby.words.v)
# Reorder the entries from the most frequent type to the least frequent one.
sorted.moby.freqs.t <- moby.freqs.t[order(moby.freqs.t, decreasing = TRUE)]

Eighty Days:

# Frequency table for Eighty Days: one entry per type holding its token count.
# It must be built from eighty.words.v; building it from the Moby Dick tokens
# would leave eighty.freqs.t describing the wrong text.
eighty.freqs.t <- table(eighty.words.v)
# Same table, ordered from the most frequent type to the least frequent one.
sorted.eighty.freqs.t <- sort(eighty.freqs.t, decreasing = TRUE)

Peter Pan:

# Frequency table for peter.words.v, ordered from the most frequent type to
# the least frequent one. local() keeps the unsorted counts out of the
# workspace.
sorted.peter.freqs.t <- local({
  counts.t <- table(peter.words.v)
  counts.t[order(counts.t, decreasing = TRUE)]
})

Plot the 20 most frequent words for Moby Dick, Eighty Days and Peter Pan

Moby Dick:

# The 20 most frequent types of Moby Dick (sorted.moby.freqs.t is already in
# decreasing order of frequency). min() caps the selection at the number of
# available types, so a short table is not padded with NA entries.
twenty.mostfreq.t <- sorted.moby.freqs.t[
  seq_len(min(20L, length(sorted.moby.freqs.t)))
]
plot(twenty.mostfreq.t)

Eighty Days:

# The 20 most frequent types of Eighty Days (sorted.eighty.freqs.t is already
# in decreasing order of frequency). min() caps the selection at the number of
# available types, so a short table is not padded with NA entries.
twenty.mostfreq2.t <- sorted.eighty.freqs.t[
  seq_len(min(20L, length(sorted.eighty.freqs.t)))
]
plot(twenty.mostfreq2.t)

Peter Pan:

# The 20 most frequent types in sorted.peter.freqs.t (already in decreasing
# order of frequency). min() caps the selection at the number of available
# types, so a short table is not padded with NA entries.
twenty.mostfreq3.t <- sorted.peter.freqs.t[
  seq_len(min(20L, length(sorted.peter.freqs.t)))
]
plot(twenty.mostfreq3.t)