While reproducing Free Range Statistics’ text analysis of Hamlet, two main issues arose: stage directions that spill over onto a second line, and speaker labels that the original patterns do not capture.
Line | Text |
---|---|
2748 | “[Enter King, Queen, Polonius, Ophelia, Rosencrantz, and” |
2749 | “Guildenstern.]” |
As a result, the last stage direction captured only line 2748, leaving the stage direction incomplete.
#' Fill runs of NA with a running count.
#'
#' Working left to right, every NA is replaced by the preceding element plus
#' one, so c(1, NA, NA, 1, NA) becomes c(1, 2, 3, 1, 2).
#'
#' @param x A numeric vector, typically 1 at the start of each run and NA
#'   everywhere else.
#' @return A vector of the same length as `x`. A leading NA has no predecessor
#'   and is left as NA, as are any NAs that directly follow it.
replace_na_with_num <- function(x) {
  # seq_along(x)[-1] is empty for vectors of length 0 or 1. The previous
  # 2:length(x) counted backwards in those cases (2:1 or 2:0), which appended
  # a spurious element to a length-one vector and errored on an empty one.
  for (i in seq_along(x)[-1]) {
    if (is.na(x[i])) {
      x[i] <- x[i - 1] + 1
    }
  }
  x
}

# Self-check: counts restart at every 1 and continue through the NAs.
stopifnot(
  replace_na_with_num(c(1, 1, 1, NA, NA, 1, NA)) == c(1, 1, 1, 2, 3, 1, 2)
)
# Download Project Gutenberg text 1524 and number its rows, so that lines can
# still be identified by their original position after later filtering.
hamlet <- gutenberg_download(1524) %>%
  # row_number() instead of 1:n(): 1:n() evaluates to c(1, 0) when there are
  # no rows (e.g. a failed download), which makes mutate() error.
  mutate(original_line_number = row_number()) %>%
  # The id is the same on every row of a single download, so drop it.
  dplyr::select(-gutenberg_id)
Original:
# Print the text stored for line 2748.
hamlet %>%
  filter(original_line_number == 2748) %>%
  pull(text)
[1] "[Enter King, Queen, Polonius, Ophelia, Rosencrantz, and"
To rectify this issue, we first accounted for stage directions that spill over onto the next line by rebuilding the `text` column with `paste()` and `lead()`, which appends the following line to the line that opens the stage direction. `left_join()` was then used to retain the original structure.
# Repair stage directions that spill over onto the next line.
#
# `text` is rebuilt so that it holds "<line> <next line>" for rows that open a
# stage direction without closing it, and NA for every other row. Joining the
# untouched `hamlet` back on by line number yields `text.x` (the repaired text,
# mostly NA) and `text.y` (the original text) for the following step to merge.
hamlet <- hamlet %>%
  mutate(
    text = case_when(
      # "^\\[[^]]*$" = starts with "[" and contains no "]" anywhere, i.e. the
      # direction is still open at the end of the line. The earlier pattern
      # "^\\[." also matched complete directions such as "[Exit.]" and pasted
      # the following, unrelated line onto them.
      # The is.na() guard stops a match on the final row being pasted with the
      # literal string "NA".
      grepl("^\\[[^]]*$", text) & !is.na(lead(text)) ~ paste(text, lead(text))
    )
  ) %>%
  # NOTE(review): only one continuation line is appended, and that line still
  # exists as its own row afterwards -- confirm whether it should be dropped.
  left_join(hamlet, by = "original_line_number")
We then used `coalesce()` to fill the missing values this introduces with the original text, and `select()` to keep only the new `text` column and the line number, retaining the original data structure.
# Prefer the repaired text (text.x) and fall back to the original (text.y),
# then return to the two-column layout: text plus original line number.
hamlet <- hamlet %>%
  mutate(text = coalesce(text.x, text.y)) %>%
  dplyr::select(text, original_line_number)
As a result, we see that the lines which had stage directions spilling over are now accounted for.
# Line 2748 again: print the text now stored for it.
hamlet %>%
  filter(original_line_number == 2748) %>%
  pull(text)
[1] "[Enter King, Queen, Polonius, Ophelia, Rosencrantz, and Guildenstern.]"
The second issue, differing speech counts, arose because certain speaker labels could not be captured by the `grepl()` patterns. The author’s current code handles most forms of character name (“1 Ambassador”, “P. King.”), but it does not handle names that begin with a Roman numeral, such as “I Player”.
To handle these, we first add “I Player” to the `speaker_abb` (speaker abbreviation) column of `personae`.
# Short names (matching `speaker_sh`) of the characters treated as main
# characters in the analysis.
main_chars <- c(
  "Hamlet", "Horatio", "Claudius", "Gertrude",
  "Ophelia", "Polonius", "Laertes", "Ghost"
)
# Lookup table of speakers, one row per speaker label.
#   speaker        long description of the character
#   speaker_abb    the label as written in the play text; this is the join key
#                  used to attach speakers to lines, so it must match exactly
#   speaker_sh     short display name
#   main_character TRUE when speaker_sh is listed in `main_chars`
personae <- tribble(
  ~speaker, ~speaker_abb, ~speaker_sh,
  "Claudius, King of Denmark.", "King.", "Claudius",
  "Hamlet, Son to the former, and Nephew to the present King.", "Ham.", "Hamlet",
  "Polonius, Lord Chamberlain.", "Pol.", "Polonius",
  "Horatio, Friend to Hamlet.", "Hor.", "Horatio",
  "Laertes, Son to Polonius.", "Laer.", "Laertes",
  "Voltimand, Courtier.", "Volt.", "Voltimand",
  "Cornelius, Courtier.", "???", "Cornelius",
  "Rosencrantz, Courtier.", "Ros.", "Rosencrantz",
  "Guildenstern, Courtier.", "Guil.", "Guildenstern",
  "Osric, Courtier.", "Osr.", "Osric",
  "A Gentleman, Courtier.", "Gent.", "Gentleman",
  "Marcellus, Officer.", "Mar.", "Marcellus",
  "Bernardo, Officer.", "Ber.", "Bernardo",
  "Francisco, a Soldier", "Fran.", "Francisco",
  "Reynaldo, Servant to Polonius.", "Rey.", "Reynaldo",
  "Players", "Players.", "Players",
  # New addition: label that starts with the Roman numeral "I".
  # The key previously carried a trailing space ('I Play. '). The only pattern
  # able to capture this label ends in "\\.$", so a key with a trailing space
  # could never match in the join.
  # NOTE(review): the short name "I Players." is kept as it was; consider a
  # cleaner display name.
  "I Player.", "I Play.", "I Players.",
  "Fortinbras, Prince of Norway.", "For.", "Fortinbras",
  "Fortinbras, Prince of Norway.", "Fort.", "Fortinbras",
  "A Captain.", "Capt.", "Captain",
  "English Ambassador 1.", "1 Ambassador.", "English Ambassador",
  "Ghost of Hamlet's Father.", "Ghost.", "Ghost",
  "Gertrude, Queen of Denmark, and Mother of Hamlet.", "Queen.", "Gertrude",
  "Ophelia, Daughter to Polonius.", "Oph.", "Ophelia",
  "Prologue to The Murder of Gonzago, a play within a play", "Pro.", "Player prologue",
  "King in The Murder of Gonzago, a play within a play", "P. King.", "Player King",
  "Queen in The Murder of Gonzago, a play within a play", "P. Queen.", "Player Queen",
  "Lucianus, nephew to the King in play within a play", "Luc.", "Lucianus",
  "Danes", "Danes.", "Danes",
  "Servant", "Servant.", "Servant",
  "Sailor", "Sailor.", "Sailor",
  "Messenger", "Mess.", "Messenger",
  "First gravedigger clown", "1 Clown.", "First gravedigger",
  "Second gravedigger clown", "2 Clown.", "Second gravedigger",
  "First priest", "1 Priest.", "First Priest",
  "Second priest", "2 Priest.", "Second Priest",
  "Danish courtier lord", "Lord.", "Lord",
  # There are 3 uses of 'All.', in each case mean different groups
  "All present on stage", "All.", "All",
  # only two uses of 'Both.', near the beginning:
  "Marcellus and Bernardo together", "Both.", "Marcellus and Bernardo"
) %>%
  # Flag whether each speaker is one of the main characters.
  mutate(
    main_character = speaker_sh %in% main_chars
  )
# One-word lines of dialogue that look like a speaker label (capitalised word
# followed by a full stop) but are not speakers.
not_people <- c(
  "He.", "Say.", "Farewell.", "Perpend.", "Nothing.",
  "Good.", "Swear.", "No.", "Dead.", "One."
)
Next, we add a further `grepl()` pattern to capture the new “I Player” form of `speaker_abb`. To account for further name combinations, we also added a pattern that recognises character names without white space. Finally, we used `fill()` to carry each speaker label down to the lines that follow it, and `left_join()` to join back to our `personae` data frame.
We also added a further condition that compares the act as well as the speaker and the scene, to rectify the issue of Hamlet ending the previous scene and starting the following scene, which would otherwise be counted as one continuous speech.
# Build `hamlet_lines`: one row per remaining line of text, tagged with the
# speaker, the most recent stage direction, the act, the scene, and the line's
# position within the current speech.
hamlet_lines <- hamlet %>%
# Drop the first 39 rows (presumably front matter before the play -- confirm).
slice(-(1:39)) %>%
# Mark lines that look like a speaker label; every other line gets NA.
mutate(
speaker_abb = case_when(
# Single capitalised word followed by a full stop, e.g. "Ham.":
grepl("^[A-Z][a-z]+\\.$", text) ~ text,
# New addition: same as above but without the end-of-line anchor.
# NOTE(review): this matches ANY line that begins with a capitalised word
# and a full stop, and stores the whole line as the label -- confirm that
# ordinary dialogue lines are not being picked up as speakers.
grepl("^[A-Z][a-z]+\\.", text) ~ text,
# "1 Ambassador", "2 Clown" and similar are picked up by this:
grepl("^[1-9]\\s[A-Z][a-z]+\\.$", text) ~ text,
# P. King. etc picked up by this:
grepl("^[A-Z]\\.\\s[A-Z][a-z]+\\.$", text) ~ text,
# New addition: single capital letter, white space, capitalised word and a
# full stop -- the shape of the Roman-numeral "I ..." labels:
grepl("^[A-Z]\\s[A-Z][a-z]+\\.$", text) ~ text
)
) %>%
# Remove empty lines.
filter(text != "") %>%
# Blank out one-word lines listed in `not_people`: they are not speakers.
mutate(
speaker_abb = if_else(
speaker_abb %in% not_people,
NA_character_,
speaker_abb
)
) %>%
# Identify stage directions:
mutate(
last_stage_direction = case_when(
# A whole line wrapped in [ ... ]:
grepl("^\\[.*\\]$", text) ~ text,
# Any line whose first character is "[" or "]":
grepl("^[]\\[]", text) ~ text
)
) %>%
# Carry the latest stage direction down, then drop the stage-direction rows
# themselves (rows whose text equals the carried value).
fill(last_stage_direction) %>%
filter(is.na(last_stage_direction) | text != last_stage_direction) %>%
# Strip the square brackets from the stored stage direction.
mutate(
last_stage_direction = gsub("[", "", last_stage_direction, fixed = TRUE),
last_stage_direction = gsub("]", "", last_stage_direction, fixed = TRUE)
) %>%
# Identify Act: heading lines start with "ACT" (any case after the A) and white
# space; carry the heading down and drop the heading rows.
mutate(act = case_when(
grepl("^A[Cc][Tt]\\s", text) ~ text
)) %>%
fill(act) %>%
filter(is.na(act) | text != act) %>%
# Identify Scene: same approach as for the act.
mutate(scene = case_when(
grepl("^S[Cc][Ee][Nn][Ee]\\s", text) ~ text
)) %>%
fill(scene) %>%
filter(is.na(scene) | text != scene) %>%
fill(speaker_abb) %>% # new addition: carry each speaker label down over the lines that follow
# Attach the speaker details from `personae`; labels with no match get NA.
# NOTE(review): rows holding the speaker label itself are not removed here, so
# the label text stays in `text` -- confirm this is intended.
left_join(personae, by = "speaker_abb") %>%
# regularise some spelling:
mutate(scene = gsub("Castle", "castle", scene)) %>%
# new addition: a speech also restarts when the act or the scene changes,
# not only when the speaker changes.
# NOTE(review): `!=` yields NA when either side is NA, so this flag can be NA
# (e.g. unmatched speaker); such rows are then numbered as a continuation.
mutate(
new_speaker_this_line = is.na(lag(speaker)) |
speaker != lag(speaker) |
scene != lag(scene) |
act != lag(act),
line_number_this_speech = ifelse(new_speaker_this_line, 1, NA), # 1 marks the first line of a speech
# Fill the NAs so the lines of each speech are numbered 1, 2, 3, ...
line_number_this_speech = replace_na_with_num(line_number_this_speech)
)
# Stop-word list: a handful of extra words tagged with the lexicon "made up",
# stacked on top of the default list returned by get_stopwords().
our_stopwords <- bind_rows(
  tibble(
    word = c(
      "thou", "thine", "o", "tis", "thee", "thy",
      "sir", "hath", "lord", "us", "one"
    ),
    lexicon = "made up"
  ),
  get_stopwords()
)
# Break every line into words: one row per word, flagged as stop word or not,
# stemmed, and numbered within its speech, the whole play, its act and its
# scene.
hamlet_words <- hamlet_lines %>%
  tidytext::unnest_tokens(
    output = "word",
    input = "text"
  ) %>%
  # A word is a stop word when it has a match in `our_stopwords`.
  left_join(
    our_stopwords,
    by = "word"
  ) %>%
  mutate(stopword = !is.na(lexicon)) %>%
  dplyr::select(-lexicon) %>%
  mutate(word_stem = wordStem(word)) %>%
  # identify the actual speeches and count the words in each speech by a
  # continuing speaker; a speech restarts when the speaker, scene or act changes
  mutate(
    new_speaker_this_word = is.na(lag(speaker)) |
      speaker != lag(speaker) |
      scene != lag(scene) |
      act != lag(act),
    # 1 on the first word of a speech, NA elsewhere, then fill the NAs so the
    # words of each speech are numbered 1, 2, 3, ...
    word_number_this_speech = ifelse(new_speaker_this_word, 1, NA),
    word_number_this_speech = replace_na_with_num(word_number_this_speech)
  ) %>%
  # Running word counters. row_number() replaces 1:n(), which evaluates to
  # c(1, 0) and makes mutate() error when there are no rows.
  mutate(word_number = row_number()) %>%
  group_by(act) %>%
  mutate(word_number_this_act = row_number()) %>%
  group_by(act, scene) %>%
  mutate(word_number_this_scene = row_number()) %>%
  ungroup()
And with this, we see that the number of speeches by Hamlet has been reconciled.
# Per-character summary, largest speaking part first.
char_summary <- hamlet_words %>%
  group_by(speaker_sh) %>%
  summarise(
    # total words attributed to the character
    words = n(),
    # each speech contributes exactly one "first word" flag
    speeches = sum(new_speaker_this_word),
    words_per_speech = words / speeches,
    stopwords = sum(stopword),
    # share of the character's words that are not stop words
    prop_non_stop = 1 - stopwords / words,
    # length in words of the character's longest single speech
    most_words_single_speech = max(word_number_this_speech)
  ) %>%
  arrange(desc(words))

# Show the summary as a paged table in the rendered document.
rmarkdown::paged_table(char_summary)