Blog Post 3

Zoe Bean
3/2/2022

Setting up the corpus

# Load the table of fics to scrape. The loop below reads column 1 as the
# page URL and column 2 as the id used to name each fic's objects.
list <- read.csv(file = "fic_list.csv")

#for(i in 1:nrow(list)){
 # print(list[i,2])
#}

# Scrape each fic, pull its metadata out of the ".meta" block, build a
# quanteda corpus from the chapter text, and attach the metadata as docvars.
# For every row this creates two objects in the calling environment:
# `fic_<id>` (the corpus) and `summary_<id>` (its summary plus metadata),
# where <id> is column 2 of `list`.
# Only the first 7 rows are processed; the combined corpus built after this
# loop names fic_1 ... fic_7 explicitly.
for (i in 1:7) {
  fic_url <- list[i, 1]
  fic_id <- list[i, 2]

  # Download the page once and reuse it for both the metadata and the text.
  page <- read_html(fic_url)

  meta_info <- page %>%
    html_nodes(css = ".meta") %>%
    html_text2()

  # str_extract() returns one value per ".meta" node; only the first is kept.
  title <- str_extract(meta_info, "(?<=[0-9]\\s).+(?=\\sby)")[1] # fic title
  author <- str_extract(meta_info, "(?<=by ).+(?=\\s)")[1] # author username
  tagged_fandom <- str_extract(meta_info, "(?<=Fandom:\\n).+(?=\\n)")[1] # fandom name
  tagged_relationships <- str_extract(meta_info, "(?<=Relationship:\\n).+(?=\\n)")[1] # relationships

  # day first published; usually when the first chapter is posted
  published_date <- str_extract(meta_info, "(?<=Published:\\s).+(?= Completed:)")[1]
  # day the last chapter was posted
  completion_date <- str_extract(meta_info, "(?<=Completed:\\s).+(?= Chapters)")[1]

  # number of chapters if more than one; otherwise the field is not there
  chapter_count <- str_extract(meta_info, "(?<=Chapters:\\s).+(?=/)")[1]
  if (is.na(chapter_count)) {
    chapter_count <- 1 # chapter count is not provided if there is only one chapter
  } else {
    chapter_count <- strtoi(chapter_count, base = 0L) # turns count into number
  }

  the_text <- page %>%
    html_nodes(css = "#chapters.userstuff .userstuff:not(blockquote)") %>%
    html_text2()

  # Create the corpus: one document per extracted text block.
  fic_corpus <- corpus(the_text)

  # Prefix document names with the fic id. Without this the default names
  # (text1, text2, ...) are duplicated across fics and combining them errors.
  docnames(fic_corpus) <- paste(fic_id, docnames(fic_corpus), sep = "-")

  # Create summary data. summary() on a corpus describes at most n = 100
  # documents by default, so a fic with more than 100 chapters would yield
  # fewer rows than documents and the docvars assignment below would fail.
  # Asking for every document keeps the row count equal to ndoc().
  fic_summary <- summary(fic_corpus, n = ndoc(fic_corpus))

  # Add indicators (each scalar is recycled across all rows).
  fic_summary$title <- title
  fic_summary$author <- author
  fic_summary$tagged_fandom <- tagged_fandom
  fic_summary$tagged_relationships <- tagged_relationships
  fic_summary$published_date <- published_date
  fic_summary$completion_date <- completion_date
  fic_summary$chapter_count <- chapter_count

  # Add metadata to each corpus.
  docvars(fic_corpus) <- fic_summary

  # Publish the results under their per-fic names (fic_<id>, summary_<id>)
  # without building and evaluating code strings.
  assign(paste0("fic_", fic_id), fic_corpus)
  assign(paste0("summary_", fic_id), fic_summary)
}

# Concatenate the seven per-fic corpora into a single corpus.
complete_corpus <- c(
  fic_1,
  fic_2,
  fic_3,
  fic_4,
  fic_5,
  fic_6,
  fic_7
)

# Show the first rows of the combined corpus summary.
complete_corpus %>%
  summary() %>%
  head()
     Text Types Tokens Sentences    Text Types Tokens Sentences
1 1-text1  1093   4171       294 1-text1  1093   4171       294
2 1-text2  1531   6712       459 1-text2  1531   6712       459
3 1-text3  1422   6195       444 1-text3  1422   6195       444
4 1-text4  1140   4169       289 1-text4  1140   4169       289
5 1-text5  1388   6297       505 1-text5  1388   6297       505
6 1-text6  1262   5301       418 1-text6  1262   5301       418
                     title         author
1 Yesterday Upon The Stair PitViperOfDoom
2 Yesterday Upon The Stair PitViperOfDoom
3 Yesterday Upon The Stair PitViperOfDoom
4 Yesterday Upon The Stair PitViperOfDoom
5 Yesterday Upon The Stair PitViperOfDoom
6 Yesterday Upon The Stair PitViperOfDoom
                                                                                                                        tagged_fandom
1 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
2 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
3 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
4 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
5 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
6 <U+50D5><U+306E><U+30D2><U+30FC><U+30ED><U+30FC><U+30A2><U+30AB><U+30C7><U+30DF><U+30A2> | Boku no Hero Academia | My Hero Academia
                                                                                                                                                                                                    tagged_relationships
1 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
2 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
3 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
4 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
5 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
6 Midoriya Izuku & Yagi Toshinori | All Might, Midoriya Izuku & Todoroki Shouto, Midoriya Izuku & Uraraka Ochako, Iida Tenya & Midoriya Izuku, Sir Nighteye & Yagi Toshinori | All Might, Midoriya Izuku & Toogata Mirio
  published_date completion_date chapter_count
1     2016-10-21      2019-10-12            60
2     2016-10-21      2019-10-12            60
3     2016-10-21      2019-10-12            60
4     2016-10-21      2019-10-12            60
5     2016-10-21      2019-10-12            60
6     2016-10-21      2019-10-12            60

Note: for some reason the metaCall docvars function throws an error on the 8th fic, which is why this is a list of 7, instead of going through the whole list. I did not have time to investigate further.