Moby Dick, Herman Melville: moby.words.v
# Download the Project Gutenberg plain-text file for Moby Dick; scan() with
# sep = "\n" yields one vector element per line of the file.
text.v <- scan(
  "http://www.gutenberg.org/files/2701/2701-0.txt",
  what = "character",
  encoding = "UTF-8",
  sep = "\n"
)

# Hand-picked indices into text.v delimiting the part that is analysed
# (presumably the first and last line of the novel itself).
start.v <- 537L
end.v <- 18927L

# Everything before start.v and after end.v is kept separately as boilerplate.
start.boilerplate.v <- text.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text.v[(end.v + 1L):length(text.v)]

# Collapse the selected lines into one long lower-cased string.
novel.lines.v <- text.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split at every non-word character and drop the empty strings that
# strsplit() produces between adjacent separators.
moby.words.l <- strsplit(novel.lower.v, "\\W")
moby.words.v <- unlist(moby.words.l)
moby.words.v <- moby.words.v[moby.words.v != ""]
View(moby.words.v)
Around the World in Eighty Days, Jules Verne: eighty.words.v
# Download the Project Gutenberg plain-text file for "Around the World in
# Eighty Days"; one vector element per line of the file.
text1.v <- scan(
  "http://www.gutenberg.org/cache/epub/103/pg103.txt",
  what = "character",
  encoding = "UTF-8",
  sep = "\n"
)
View(text1.v)

# Hand-picked indices into text1.v delimiting the part that is analysed
# (presumably the first and last line of the novel itself).
start.v <- 77L
end.v <- 6145L

# Everything before start.v and after end.v is kept separately as boilerplate.
start.boilerplate.v <- text1.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text1.v[(end.v + 1L):length(text1.v)]

# Collapse the selected lines into one long lower-cased string.
novel.lines.v <- text1.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split at every non-word character and drop the empty strings that
# strsplit() produces between adjacent separators.
eighty.words.l <- strsplit(novel.lower.v, "\\W")
eighty.words.v <- unlist(eighty.words.l)
eighty.words.v <- eighty.words.v[eighty.words.v != ""]
View(eighty.words.v)
Peter Pan, J. M. Barrie: peter.words.v
# Download the Project Gutenberg plain-text file for Peter Pan; scan() with
# sep = "\n" yields one vector element per line of the file.
text2.v <- scan(
  "http://www.gutenberg.org/files/16/16-0.txt",
  what = "character",
  encoding = "UTF-8",
  sep = "\n"
)
View(text2.v)

# Hand-picked indices into text2.v delimiting the part that is analysed
# (presumably the first and last line of the novel itself).
start.v <- 39L
end.v <- 4480L

# Everything before start.v and after end.v is kept separately as boilerplate.
# The trailing slice must be bounded by the length of text2.v (this text),
# not by the length of another book's vector.
start.boilerplate.v <- text2.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text2.v[(end.v + 1L):length(text2.v)]

# Collapse the selected lines into one long lower-cased string.
novel.lines.v <- text2.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split at every non-word character and drop the empty strings that
# strsplit() produces between adjacent separators. The word vector is built
# from peter.words.l, i.e. from this novel's own split result.
peter.words.l <- strsplit(novel.lower.v, "\\W")
peter.words.v <- unlist(peter.words.l)
peter.words.v <- peter.words.v[peter.words.v != ""]
View(peter.words.v)
# Number of word tokens (all words, repeats included) in each word vector.
# Lines starting with "##" are output recorded from an earlier run.
length(moby.words.v)
## [1] 214942
length(peter.words.v)
## [1] 63760
length(eighty.words.v)
## [1] 63760
# Number of word types (distinct words) in each word vector.
length(unique(moby.words.v))
## [1] 17058
length(unique(peter.words.v))
## [1] 6823
length(unique(eighty.words.v))
## [1] 6823
# NOTE(review): the recorded Peter Pan figures are identical to the Eighty
# Days figures for both tokens and types, which suggests peter.words.v held
# the Eighty Days words when this output was captured -- re-run and confirm.
# Type-token ratio (TTR) per novel, expressed as the number of distinct word
# types per 100 word tokens.
TTR.eighty <- length(unique(eighty.words.v)) * 100 / length(eighty.words.v)
TTR.moby <- length(unique(moby.words.v)) * 100 / length(moby.words.v)
TTR.peter <- length(unique(peter.words.v)) * 100 / length(peter.words.v)
# This value was previously stored under the misspelled name "TRR.peter";
# keep that name as an alias so any code that still refers to it keeps working.
TRR.peter <- TTR.peter

# Collect the three ratios in one named vector and compare them in a barplot.
TTR <- c(TTR.eighty, TTR.moby, TTR.peter)
names(TTR) <- c("Eighty Days", "Moby Dick", "Peter") # assign names to vector TTR
barplot(TTR, main = "Simple TTR")
TTR
## Eighty Days Moby Dick Peter
## 10.701066 7.936094 10.701066
# NOTE(review): the output above was recorded from an earlier run. The Peter
# value equals the Eighty Days value exactly, which suggests peter.words.v
# held the Eighty Days words at that time -- re-run to refresh these numbers.
Moby Dick:
# Frequency table for Moby Dick: one entry per word type holding its token
# count. table() orders the entries alphabetically by type.
moby.freqs.t <- table(moby.words.v)

# The same table reordered so that the most frequent type comes first.
# (Equivalent one-liner: sort(table(moby.words.v), decreasing = TRUE).)
sorted.moby.freqs.t <- sort(moby.freqs.t, decreasing = TRUE)
Eighty Days:
# Frequency table for Eighty Days: one entry per word type holding its token
# count, built from eighty.words.v (this novel's own word vector).
eighty.freqs.t <- table(eighty.words.v)

# The same table reordered so that the most frequent type comes first.
sorted.eighty.freqs.t <- sort(eighty.freqs.t, decreasing = TRUE)
Peter Pan:
# Frequency table of the word types in peter.words.v, ordered so that the
# most frequent type comes first.
sorted.peter.freqs.t <- sort(
  table(peter.words.v),
  decreasing = TRUE
)
Moby Dick, Eighty Days and Peter Pan
Moby Dick:
# Take the first twenty entries of the frequency-sorted Moby Dick table
# (the twenty most frequent word types) and plot their counts.
twenty.mostfreq.t <- sorted.moby.freqs.t[seq_len(20L)]
plot(twenty.mostfreq.t)
Eighty Days
# Take the first twenty entries of the frequency-sorted Eighty Days table
# (the twenty most frequent word types) and plot their counts.
twenty.mostfreq2.t <- sorted.eighty.freqs.t[seq_len(20L)]
plot(twenty.mostfreq2.t)
Peter Pan
# Take the first twenty entries of the frequency-sorted Peter Pan table
# (the twenty most frequent word types) and plot their counts.
twenty.mostfreq3.t <- sorted.peter.freqs.t[seq_len(20L)]
plot(twenty.mostfreq3.t)