# Build one quanteda corpus per fic listed in fic_list.csv, attach work-level
# metadata scraped from each fic's HTML page as docvars, and combine all of
# them into `complete_corpus`.
#
# Column 1 of fic_list.csv is handed to read_html() (a URL or file path);
# column 2 is the identifier used to name the per-fic objects (`fic_<id>`,
# `summary_<id>`) and to prefix the document names.
fic_list <- read.csv("fic_list.csv")
# Backward-compatible alias: this table used to be exposed under the name
# `list`; keep it in case other code still refers to it.
list <- fic_list

# Only the first 7 fics are processed. An earlier revision failed in
# `docvars<-` on the 8th fic. A likely (unconfirmed) cause is that summary()
# on a corpus returns at most 100 rows by default, so its row count no longer
# matched the number of chapters; that is addressed below via `n = ndoc(...)`.
# TODO: raise this cap to nrow(fic_list) once the 8th fic has been re-tested.
max_fics <- 7L
n_fics <- min(max_fics, nrow(fic_list))

# Regular expressions applied to the text of the first `.meta` node.
# NOTE(review): the first `.meta` node is presumably the work-level header
# (later ones look like per-chapter blocks) -- confirm against a sample page.
meta_patterns <- c(
  title = "(?<=[0-9]\\s).+(?=\\sby)", # fic title
  author = "(?<=by ).+(?=\\s)", # author username
  tagged_fandom = "(?<=Fandom:\\n).+(?=\\n)", # fandom name
  tagged_relationships = "(?<=Relationship:\\n).+(?=\\n)", # relationships
  # day first published; usually when the first chapter is posted
  published_date = "(?<=Published:\\s).+(?= Completed:)",
  # day the last chapter was posted
  completion_date = "(?<=Completed:\\s).+(?= Chapters)"
)
# Number of chapters; only present when the fic has more than one chapter.
chapter_pattern <- "(?<=Chapters:\\s).+(?=/)"

corpora <- vector("list", n_fics)

for (i in seq_len(n_fics)) {
  fic_source <- fic_list[i, 1]
  fic_id <- as.character(fic_list[i, 2])

  # Parse the page once and reuse it for both the metadata and the text.
  page <- read_html(fic_source)

  meta_info <- page %>%
    html_nodes(css = ".meta") %>%
    html_text2()
  # Only the first `.meta` node is used; this is NA when the page has none.
  work_meta <- meta_info[1]

  # Each field is the first regex match in the header, or NA if absent.
  work_fields <- lapply(
    meta_patterns,
    function(pattern) str_extract(work_meta, pattern)
  )

  chapter_count <- str_extract(work_meta, chapter_pattern)
  if (is.na(chapter_count)) {
    # The chapter count is not provided when there is only one chapter.
    chapter_count <- 1L
  } else {
    # Base 10 explicitly, so a leading zero is never read as octal.
    chapter_count <- strtoi(chapter_count, base = 10L)
  }

  # One element per chapter body.
  the_text <- page %>%
    html_nodes(css = "#chapters.userstuff .userstuff:not(blockquote)") %>%
    html_text2()

  if (length(the_text) == 0) {
    # Without any documents the scalar metadata below cannot be attached.
    warning(
      "No chapter text found for fic ", fic_id, "; skipping it.",
      call. = FALSE
    )
    next
  }

  # Create the corpus: one document per chapter.
  fic_corpus <- corpus(the_text)

  # Prefix document names with the fic id. Otherwise the default names
  # (text1, text2, ...) collide when the corpora are combined.
  docnames(fic_corpus) <- paste(fic_id, docnames(fic_corpus), sep = "-")

  # Per-chapter summary. `n` must cover every document: the default caps the
  # result at 100 rows, which breaks the docvars assignment for longer fics.
  fic_summary <- summary(fic_corpus, n = ndoc(fic_corpus))

  # Add the work-level indicators (recycled across all chapters).
  for (field in names(work_fields)) {
    fic_summary[[field]] <- work_fields[[field]]
  }
  fic_summary$chapter_count <- chapter_count

  # Attach the summary plus metadata to each document of the corpus.
  docvars(fic_corpus) <- fic_summary

  # Keep the per-fic objects available under their historical names
  # (fic_<id>, summary_<id>) without resorting to eval(parse()).
  assign(paste0("fic_", fic_id), fic_corpus)
  assign(paste0("summary_", fic_id), fic_summary)

  corpora[[i]] <- fic_corpus
}

# Drop skipped fics, then combine every per-fic corpus into one.
corpora <- Filter(Negate(is.null), corpora)
stopifnot(length(corpora) > 0)
complete_corpus <- do.call(c, corpora)
head(summary(complete_corpus))
Text Types Tokens Sentences Text Types Tokens Sentences
1 1-text1 1093 4171 294 1-text1 1093 4171 294
2 1-text2 1531 6712 459 1-text2 1531 6712 459
3 1-text3 1422 6195 444 1-text3 1422 6195 444
4 1-text4 1140 4169 289 1-text4 1140 4169 289
5 1-text5 1388 6297 505 1-text5 1388 6297 505
6 1-text6 1262 5301 418 1-text6 1262 5301 418
title author
1 Yesterday Upon The Stair PitViperOfDoom
2 Yesterday Upon The Stair PitViperOfDoom
3 Yesterday Upon The Stair PitViperOfDoom
4 Yesterday Upon The Stair PitViperOfDoom
5 Yesterday Upon The Stair PitViperOfDoom
6 Yesterday Upon The Stair PitViperOfDoom
tagged_fandom
1 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
2 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
3 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
4 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
5 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
6 僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia
tagged_relationships
1 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
2 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
3 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
4 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
5 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
6 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
published_date completion_date chapter_count
1 2016-10-21 2019-10-12 60
2 2016-10-21 2019-10-12 60
3 2016-10-21 2019-10-12 60
4 2016-10-21 2019-10-12 60
5 2016-10-21 2019-10-12 60
6 2016-10-21 2019-10-12 60
Note: for some reason, the docvars assignment built in metaCall throws an error on the 8th fic, which is why this is a list of 7 instead of going through the whole list. I did not have time to investigate further.