Read in Montag and Hudson-Kam et al. (2017) titles
# Load the raw Montag corpus file into a data frame with one row per line of
# the file, held in a single column named `text`.
d_raw <- data.frame(
  text = read_lines("../../data/raw/montag_corpus/100Books.txt")
)
# Build the Montag title table: one row per "Title" line of the raw corpus,
# numbered in file order (`book_id`), with the title lower-cased and a few
# spellings recoded by hand.
# NOTE(review): the recodes presumably align Montag spellings with the IBDb
# spellings used for matching -- confirm against the IBDb file.
# NOTE: fct_recode() returns a factor, so `title` is a factor column.
montag_titles <- d_raw %>%
  filter(str_detect(text, regex("^Title", ignore_case = TRUE))) %>%
  transmute(
    # row_number() stays valid when no line matches; 1:n() would give c(1, 0)
    # and make the pipeline fail on an empty input.
    book_id = row_number(),
    # Strip the label with an anchored, case-insensitive pattern so it agrees
    # with the filter above ("title: x" is cleaned just like "Title: x") and
    # never touches a "Title: " that occurs inside the title text itself.
    title = str_replace(text, regex("^title:\\s*", ignore_case = TRUE), ""),
    title = tolower(title),
    # New spelling on the left, spelling found in the corpus on the right.
    title = fct_recode(
      title,
      "click, clack, moo: cows that type" = "click, clack, moo cows that type",
      "good night, gorilla" = "good night gorilla",
      "don't let the pigeon drive the bus!" = "don’t let the pigeon drive the bus",
      "chrysanthemum" = "chrysanthemum ",
      "oh, the places you’ll go!" = "oh, the places you’ll go",
      "i'm a big sister" = "i’m a big sister",
      "chicka chicka 123" = "chicka chicka 1-2-3"
    )
  )
# Count how often each (lower-cased) title appears across the five book
# columns of the IBDb norms file; rows are ordered from most to least frequent.
ibdb_titles <- read_csv("../../data/raw/norms/IBDb.csv") %>%
  select(book1, book2, book3, book4, book5) %>%
  # Stack the five columns into a single long `title` column.
  gather(key = "book", value = "title") %>%
  # Drop empty slots before normalising case.
  filter(!is.na(title)) %>%
  mutate(title = tolower(title)) %>%
  count(title, sort = TRUE)
#ggplot(ibdb_titles, aes (x = n)) +
# geom_histogram() +
# theme_classic()
There are 2232 unique titles in the Hudson-Kam dataset.
63/100 books in the Montag dataset are also in the Hudson-Kam dataset. Here are the 100 most frequently listed books in the Hudson-Kam dataset that are NOT in the Montag dataset.
# Titles present in both corpora, and Montag titles with no IBDb counterpart.
ibdb_books_in_montag <- intersect(ibdb_titles$title, montag_titles$title)
montag_books_missing_from_ibdb <- setdiff(montag_titles$title, ibdb_titles$title)

# Interactive table of the 100 most frequent IBDb titles that are not among
# the titles shared with the Montag corpus.
ibdb_titles %>%
  filter(!title %in% ibdb_books_in_montag) %>%
  arrange(desc(n)) %>%
  head(100) %>%
  as.data.frame() %>%
  datatable()