Compare Corpora

Building blocks to create atomic character vectors

Moby Dick, Herman Melville: moby.words.v

# Build moby.words.v: a character vector with one lower-cased word token per
# element, taken from the Project Gutenberg plain text of "Moby Dick".

# Read the file line by line (sep = "\n" makes every line one element).
text.v <- scan(
  "http://www.gutenberg.org/files/2701/2701-0.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)

# First and last line of the novel proper; everything outside this range is
# treated as boilerplate.
start.v <- 537L
end.v <- 18927L

# Derive both boilerplate slices from start.v / end.v instead of repeating the
# literals. The previous `1:537-1` parsed as `(1:537) - 1` (i.e. 0:536) and
# only worked because R silently ignores a zero index. The logical mask for
# the tail cannot run backwards if the file is shorter than expected.
start.boilerplate.v <- text.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text.v[seq_along(text.v) > end.v]

# Collapse the novel into one long lower-case string.
novel.lines.v <- text.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split on every single non-word character; adjacent separators yield empty
# strings, which are dropped afterwards.
moby.words.l <- strsplit(novel.lower.v, "\\W")
moby.words.v <- unlist(moby.words.l)
moby.words.v <- moby.words.v[nzchar(moby.words.v)]

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(moby.words.v)
}

Around the World in Eighty Days, Jules Verne: eighty.words.v

# Build eighty.words.v: a character vector with one lower-cased word token per
# element, taken from the Project Gutenberg plain text of "Around the World in
# Eighty Days".

# Read the file line by line (sep = "\n" makes every line one element).
text1.v <- scan(
  "http://www.gutenberg.org/cache/epub/103/pg103.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(text1.v)
}

# First and last line of the novel proper; everything outside this range is
# treated as boilerplate.
start.v <- 77L
end.v <- 6145L

# Derive both boilerplate slices from start.v / end.v instead of repeating the
# literals. The previous `1:77-1` parsed as `(1:77) - 1` (i.e. 0:76) and only
# worked because R silently ignores a zero index. The logical mask for the
# tail cannot run backwards if the file is shorter than expected.
start.boilerplate.v <- text1.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text1.v[seq_along(text1.v) > end.v]

# Collapse the novel into one long lower-case string.
novel.lines.v <- text1.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split on every single non-word character; adjacent separators yield empty
# strings, which are dropped afterwards.
eighty.words.l <- strsplit(novel.lower.v, "\\W")
eighty.words.v <- unlist(eighty.words.l)
eighty.words.v <- eighty.words.v[nzchar(eighty.words.v)]

if (interactive()) {
  View(eighty.words.v)
}

Peter Pan, J. M. Barrie: peter.words.v

# Build peter.words.v: a character vector with one lower-cased word token per
# element, taken from the Project Gutenberg plain text of "Peter Pan".

# Read the file line by line (sep = "\n" makes every line one element).
text2.v <- scan(
  "http://www.gutenberg.org/files/16/16-0.txt",
  what = "character", encoding = "UTF-8", sep = "\n"
)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(text2.v)
}

# First and last line of the novel proper; everything outside this range is
# treated as boilerplate. Written as integer literals like the other corpora.
start.v <- 39L
end.v <- 4480L

# Both slices are taken from text2.v and derived from start.v / end.v.
# Fixes: the tail used to be bounded by length(text1.v) (the Eighty Days
# text), and `1:39-1` parsed as `(1:39) - 1`, working only because R ignores
# a zero index.
start.boilerplate.v <- text2.v[seq_len(start.v - 1L)]
end.boilerplate.v <- text2.v[seq_along(text2.v) > end.v]

# Collapse the novel into one long lower-case string.
novel.lines.v <- text2.v[start.v:end.v]
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)

# Split on every single non-word character; adjacent separators yield empty
# strings, which are dropped afterwards.
# Fix: the word vector is now built from peter.words.l. It used to be built
# from eighty.words.l / eighty.words.v, which made peter.words.v a copy of the
# Eighty Days words.
peter.words.l <- strsplit(novel.lower.v, "\\W")
peter.words.v <- unlist(peter.words.l)
peter.words.v <- peter.words.v[nzchar(peter.words.v)]

if (interactive()) {
  View(peter.words.v)
}

Using the same code as above, compare the following for the three texts:

number of tokens and types

Number of tokens

# Token counts: length() gives the number of elements in each word vector,
# i.e. how many word tokens were kept for that text.
length(moby.words.v)

## [1] 214942

# NOTE(review): the result recorded below is identical to the one recorded
# for eighty.words.v — confirm that peter.words.v really holds the Peter Pan
# words.
length(peter.words.v)

## [1] 63760

length(eighty.words.v)

## [1] 63760

Number of types

# Type counts: unique() drops repeated words, so the length of its result is
# the number of distinct word forms in each text.
length(unique(moby.words.v))

## [1] 17058

# NOTE(review): the result recorded below is identical to the one recorded
# for eighty.words.v — confirm that peter.words.v really holds the Peter Pan
# words.
length(unique(peter.words.v))

## [1] 6823

length(unique(eighty.words.v))

## [1] 6823

Simple TTR

# Simple type-token ratio (TTR) as a percentage: distinct words (types)
# divided by running words (tokens), times 100.
simple.ttr <- function(words.v) {
  length(unique(words.v)) * 100 / length(words.v)
}

TTR.eighty <- simple.ttr(eighty.words.v)
TTR.moby <- simple.ttr(moby.words.v)
TTR.peter <- simple.ttr(peter.words.v)
# The original spelling was `TRR.peter`; keep it as an alias so that any code
# still referring to the old name continues to work.
TRR.peter <- TTR.peter

# Named vector: the names become the bar labels in the plot.
TTR <- c(
  "Eighty Days" = TTR.eighty,
  "Moby Dick" = TTR.moby,
  "Peter" = TTR.peter
)
barplot(TTR, main = "Simple TTR")

TTR

## Eighty Days   Moby Dick       Peter 
##   10.701066    7.936094   10.701066

Write code to sort all types by frequency

Moby Dick:

# Count how often each type occurs in Moby Dick (table() orders the types
# alphabetically) ...
moby.freqs.t <- table(moby.words.v)
# ... then reorder the counts from most to least frequent. This is the same
# result as the one-step form sort(table(moby.words.v), decreasing = TRUE).
sorted.moby.freqs.t <- sort(moby.freqs.t, decreasing = TRUE)

Eighty Days:

# Count how often each type occurs in Eighty Days (table() orders the types
# alphabetically), then reorder the counts from most to least frequent.
# Fix: the table is now built from eighty.words.v; it used to be built from
# moby.words.v, leaving eighty.freqs.t holding the Moby Dick counts.
eighty.freqs.t <- table(eighty.words.v)
sorted.eighty.freqs.t <- sort(eighty.freqs.t, decreasing = TRUE)

Peter Pan:

# Count each type in peter.words.v, then reorder the counts from most to
# least frequent.
sorted.peter.freqs.t <- table(peter.words.v)
sorted.peter.freqs.t <- sort(sorted.peter.freqs.t, decreasing = TRUE)

Plot the 20 most frequent words for `Moby Dick`, `Eighty Days` and `Peter Pan`

Moby Dick:

# Take the first 20 entries of the frequency-sorted Moby Dick table and plot
# them.
twenty.mostfreq.t <- sorted.moby.freqs.t[seq_len(20L)]
plot(twenty.mostfreq.t)

Eighty Days

# Take the first 20 entries of the frequency-sorted Eighty Days table and plot
# them.
twenty.mostfreq2.t <- sorted.eighty.freqs.t[seq_len(20L)]
plot(twenty.mostfreq2.t)

Peter Pan

# Take the first 20 entries of the frequency-sorted Peter Pan table and plot
# them.
twenty.mostfreq3.t <- sorted.peter.freqs.t[seq_len(20L)]
plot(twenty.mostfreq3.t)

Compare Corpora

JTL

30 6 2020

Building blocks to create atomic character vectors

Number of tokens

Number of types

Simple TTR

Write code to sort all types by frequency

Plot the 20 most frequent words for `Moby Dick`, `Eighty Days` and `Peter Pan`

Compare Corpora

JTL

30 6 2020

Building blocks to create atomic character vectors

Number of tokens

Number of types

Simple TTR

Write code to sort all types by frequency

Plot the 20 most frequent words for Moby Dick, Eighty Days and Peter Pan

Plot the 20 most frequent words for `Moby Dick`, `Eighty Days` and `Peter Pan`