# Corpus creation code
#
# Zoe Bean
library(rvest)
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(stringr)

# Index of works to process. Column 1 is handed to read_html() and column 2
# supplies the suffix used in the fic_<id> / summary_<id> object names.
list <- read.csv(file = "fic_list.csv")

# One slot per row of the index; filled with the name of each corpus object
# so the individual corpora can be combined once the loop has finished.
fic_names <- vector("list", length = nrow(list))



# Build the corpus and the summary table for a single work.
#
# Arguments:
#   path  value passed straight to read_html() (taken from column 1 of the
#         index table).
#   id    identifier from column 2 of the index table; used as the prefix of
#         every document name in the corpus.
#
# Returns a list with two elements:
#   corpus   quanteda corpus with one document per extracted text node; its
#            docvars are a copy of the summary table (without `chapter`).
#   summary  the summary() table of the corpus plus the scraped metadata
#            columns and a numeric `chapter` column.
build_fic <- function(path, id) {
  # Parse the page once and reuse it for both the metadata and the text.
  page <- read_html(path)

  meta_info <- page %>%
    html_nodes(css = ".meta") %>%
    html_text2()

  # str_extract() returns one result per ".meta" node. Only the result for
  # the first node is kept, so the value is NA when that node has no match.
  first_match <- function(pattern) {
    str_extract(meta_info, pattern)[1]
  }

  title <- first_match("(?<=[0-9]\\n).+(?=\\sby)") # fic title
  author <- first_match("(?<=by ).+(?=\\s\\nSummary)") # author username
  tagged_fandom <- first_match("(?<=Fandom:\\n).+(?=\\n)") # fandom name
  tagged_relationships <- first_match("(?<=Relationship:\\n).+(?=\\n)")
  tagged_characters <- first_match("(?<=Character:\\n).+(?=\\n)")
  # day first published; usually when the first chapter is posted
  published_date <- first_match(
    "(?<=Published:\\s)[0-9][0-9][0-9][0-9][:punct:][0-9][0-9][:punct:][0-9][0-9]"
  )
  # day the last chapter was posted
  completion_date <- first_match("(?<=Completed:\\s).+(?= Chapters)")
  # number of chapters if more than one; otherwise not present
  chapter_count <- first_match("(?<=Chapters:\\s).+(?=/)")

  if (is.na(chapter_count)) {
    # No chapter count found: treat the work as a single chapter, whose
    # completion date is its publication date.
    chapter_count <- 1
    completion_date <- published_date
  } else {
    # Explicit base 10: base 0 would read a leading "0" as octal.
    chapter_count <- strtoi(chapter_count, base = 10L)
  }

  the_text <- page %>%
    html_nodes(css = "#chapters.userstuff .userstuff:not(blockquote)") %>%
    html_text2()

  fic_corpus <- corpus(the_text)

  # Prefix the default document names with the work id. Without this the
  # names would be duplicated across works and combining the corpora fails.
  docnames(fic_corpus) <- paste(id, docnames(fic_corpus), sep = "-")

  # Summary table covering every document, extended with the metadata.
  fic_summary <- summary(fic_corpus, length(fic_corpus))
  fic_summary$title <- title
  fic_summary$author <- author
  fic_summary$tagged_fandom <- tagged_fandom
  fic_summary$tagged_relationships <- tagged_relationships
  fic_summary$tagged_characters <- tagged_characters
  fic_summary$published_date <- published_date
  fic_summary$completion_date <- completion_date
  fic_summary$chapter_count <- chapter_count

  # Attach the metadata to the corpus.
  docvars(fic_corpus) <- fic_summary

  # Chapter indicator: the digits that follow a letter in the document name.
  # NOTE(review): this column is added after docvars() was copied, so it is
  # present in the summary table only, not in the corpus -- confirm that this
  # is intended before reordering.
  fic_summary$chapter <- as.numeric(
    str_extract(fic_summary$Text, "(?<=[:alpha:])[0-9]+")
  )

  # `list` here resolves to the base function: R skips the data frame of the
  # same name when looking up a function.
  list(corpus = fic_corpus, summary = fic_summary)
}

# Create fic_<id> and summary_<id> for every row of the index and record the
# corpus object names so they can be combined afterwards.
for (i in seq_len(nrow(list))) {
  fic_parts <- build_fic(list[i, 1], list[i, 2])

  assign(paste0("fic_", list[i, 2]), fic_parts$corpus)
  assign(paste0("summary_", list[i, 2]), fic_parts$summary)

  # add to list of names for combination
  fic_names[[i]] <- paste0("fic_", list[i, 2])

  # clear the temporary so only the fic_* / summary_* objects remain
  rm(fic_parts)
}

# The helper is only needed while the loop runs.
rm(build_fic)

#create complete corpus
# Create the complete corpus by concatenating every individual corpus.
# fic_names holds the names of the fic_<id> objects; they are looked up in
# the current environment and passed to c() as unnamed arguments, which is
# the same call the names would produce if typed out by hand.
complete_corpus <- do.call(
  c,
  unname(mget(as.character(unlist(fic_names)), envir = environment()))
)

# Remove the loop index, the index table and the name list; only the
# corpora and summary tables are meant to stay behind.
rm(i)
rm(list)
rm(fic_names)

# Print a short description of the objects this script leaves behind and of
# the metadata variables attached to the corpus. Each string is one output
# line; they are joined with newlines and no trailing newline is written.
cat(
  "creates following objects:",
  "  -complete_corpus (the collated corpus of all works)",
  "  -fic_[#] (the individual corpus for each work, numbered by popularity) ",
  "  -summary_[#] (the metadata for each fic)",
  "  ",
  "  ",
  "  the extracted metadata variables for the corpus are:",
  "  -title",
  "  -author",
  "  -tagged_fandom",
  "  -tagged_relationships",
  "  -tagged_characters",
  "  -published_date",
  "  -completion_date",
  "  -chapter_count",
  "  ",
  sep = "\n"
)