library(tidyverse)
library(dplyr)
library(RSQLite)
There is a Wikibooks data set available on Kaggle and I wanted to explore it a bit. I was able to extract the ISBN for a percentage of the books for all languages by looking for the first occurrence af the string “ISBN” and cleaning up the portion of text after that. Taking only numbers then ensure the result is the correct length.
The method extracted something that looks like an ISBN for about 2% of the books.
The first thing I wanted to check is how many books there are in each of the different languages.
This code threw errors on my machine indicating that not all rows were retrieved however I was able to check the source and my result is a full data set.
Note also the full dataset is a couple of gigs so the script took a long timee to process on my MacBookPro, I didn’t time it but could have been over an hour.
wb_data_base <- file.path('/Users',
'macatron',
'Documents',
'Data Analysis',
'R',
'wiki--books',
'wikibooks.sqlite')
#creating connection
con <- DBI::dbConnect(RSQLite::SQLite(), wb_data_base)
# Lookup to get the full language from the two character part
Languages <- data.frame(Abbv = c('de',
'en',
'es',
'fr',
'he',
'hu',
'it',
'ja',
'nl',
'pl',
'pt',
'ru'),
full_name = c('German',
'English',
'Spanish',
'French',
'Greek',
'Hungarian',
'Italian',
'Japanese',
'Dutch',
'Polish',
'Portuguese',
'Russian'))
names_stripper <- function(full_title)
#Function that removes 'Wikibooks: ' from the start of the title and everything after and including the first slash
{
title_no_prefix <- full_title %>%
str_remove_all('Wikibooks: ')
slash_loc <- str_locate(title_no_prefix, '/')[1]
if (is.na(slash_loc)) {
title_no_suffix <- title_no_prefix }
else if (slash_loc > 0) {
title_no_suffix <- title_no_prefix %>%
str_sub(1, slash_loc-1)} else {
print('Error Error Does Not Compute')
title_no_suffix <- 'Error Error Does Not Compute'
}
title_no_suffix }
title_counter <- function(languages_brv)
# function that counts how many titles there ar in the data base for that language
{
con <- DBI::dbConnect(RSQLite::SQLite(), wb_data_base)
x <- dbSendQuery(con, paste0("SELECT count (*) FROM ", languages_brv), n = Inf) %>%
dbFetch(n =Inf)
x <- data.frame(titles = x, Abbv = languages_brv)
}
Titles_table <- map_dfr(Languages$Abbv, title_counter)
Titles_table <- Titles_table %>%
left_join(Languages, by = c('Abbv' = 'Abbv')) %>%
arrange(desc(count....))
Titles_table <- Titles_table %>%
mutate(Language = factor(full_name, levels = Titles_table$full_name)) %>%
rename(Titles = count....)
dbDisconnect(con)
The vast majority of books are English,
Titles_table %>%
ggplot(aes(x = Language, y = Titles)) +
geom_col(fill = 'cornflower blue') +
geom_label(aes(label = Titles), size = 2.5) +
theme_minimal() +
labs(title = 'Wikibooks Number of Titles by Language',
subtitle = 'Extraction date: Sep 09 2021',
caption = 'Source: Kaggle, starter-wikibooks-dataset') +
coord_flip()
# note this step takes quite a while on my MacBook, like 30 mimns.
con <- DBI::dbConnect(RSQLite::SQLite(), wb_data_base)
Table_puller <- function(tbl_name) {table <- dbSendQuery(con,
paste0("SELECT * FROM ",
tbl_name),
n = Inf) %>%
dbFetch(n =Inf)
table$language <- tbl_name
table
}
all_lang <- map_dfr(Languages$Abbv, Table_puller)
dbDisconnect(con)
The ISBNS are extracted and added to the data frame.
# function takes the whole text of a book and returns a cleaned ISBN number
ISBN_puller <- function(book_text) {
clean_ISBN <- NA
ISBN_Loc <- str_locate(book_text, 'ISBN')[1]
if (is.na(ISBN_Loc)) {NA} else {
un_clean_ISBN <- substr(book_text,
ISBN_Loc,
ISBN_Loc + 30)
clean_ISBN <- un_clean_ISBN %>%
str_extract_all("[0-9]") %>%
unlist() %>%
str_c(collapse = '')
if (nchar(clean_ISBN) >= 13 ) {
clean_ISBN <- substr(clean_ISBN, 1, 13) }
else if (nchar(clean_ISBN) >= 10) {
clean_ISBN <- substr(clean_ISBN, 1, 10) } else {
clean_ISBN <- NA}
}
clean_ISBN
}
ISBNs <- map_chr(all_lang$body_text, ISBN_puller)
all_lang$ISBN <- ISBNs
ISBN_books <- all_lang %>%
filter(!is.na(ISBN))
# saving ISBN_books as an r-file to save creating it again
# tidying up to save some memory
saveRDS(ISBN_books, 'ISBN_books.rds')
rm(all_lang)
rm(ISBNs)
Books_with_ISBN <- ISBN_books %>%
group_by(language) %>%
summarise(ISBNs = n())
Books_with_ISBN <- Books_with_ISBN %>%
left_join(Titles_table, by = c('language' = 'Abbv'))
ISBN_pct_Table <- Books_with_ISBN %>%
mutate(ISBN_Pct = (ISBNs / Titles) * 100 %>% round(0)) %>%
arrange(desc(ISBN_Pct))
ISBN_pct_Table <- ISBN_pct_Table %>%
mutate(Language = factor(Language, levels = ISBN_pct_Table$Language))
ISBN_pct_Table %>%
ggplot(aes(x = Language, y = ISBN_Pct)) +
geom_col(fill = 'cornflower blue') +
geom_label(aes(label = ISBN_Pct %>% round(2)), size = 2.5) +
theme_minimal() +
labs(title = 'Wikibooks Percent With Extractable ISBN by Language',
subtitle = 'Extraction date: Sep 09 2021',
caption = 'Source: Kaggle, starter-wikibooks-dataset') +
coord_flip()
I thought this would work for a higher percentage of books, but I was pleased the method worked for all languages, especially that it worked for Japanese.
Apply the method to the HTML text for the book rather than the full text.
Link the ISBN number to a library data base to add additional information to the books such as author, subject etc.