library(rvest)
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(stringr)
# Build one quanteda corpus per work listed in fic_list.csv, plus a combined
# corpus of all works.
#
# Input:
#   fic_list.csv -- column 1 is passed to read_html() as the work's URL and
#                   column 2 is the identifier used to name the objects below.
# Objects created in the calling environment:
#   fic_<id>         corpus for a single work
#   summary_<id>     summary() of that corpus plus the scraped metadata
#   complete_corpus  all per-work corpora concatenated with c()
# Every other variable defined here is temporary and is removed again.

work_index <- read.csv("fic_list.csv")
work_corpora <- vector(mode = "list", length = nrow(work_index))

# Return the regex match from the first scraped ".meta" node only (NA when the
# pattern is absent or when no node was found at all).
first_match <- function(text, pattern) {
  str_extract(text, pattern)[1]
}

for (i in seq_len(nrow(work_index))) {
  work_url <- as.character(work_index[i, 1])
  work_id <- as.character(work_index[i, 2])

  # Download and parse the page once; metadata and text are both read from it.
  work_page <- read_html(work_url)

  # ---- metadata ----
  meta_info <- work_page %>%
    html_nodes(css = ".meta") %>%
    html_text2()

  title <- first_match(meta_info, "(?<=[0-9]\\n).+(?=\\sby)")
  author <- first_match(meta_info, "(?<=by ).+(?=\\s\\nSummary)")
  tagged_fandom <- first_match(meta_info, "(?<=Fandom:\\n).+(?=\\n)")
  tagged_relationships <- first_match(
    meta_info, "(?<=Relationship:\\n).+(?=\\n)"
  )
  tagged_characters <- first_match(meta_info, "(?<=Character:\\n).+(?=\\n)")
  # Day first published; usually when the first chapter is posted.
  published_date <- first_match(
    meta_info,
    "(?<=Published:\\s)[0-9][0-9][0-9][0-9][:punct:][0-9][0-9][:punct:][0-9][0-9]"
  )
  # Day the last chapter was posted.
  completion_date <- first_match(meta_info, "(?<=Completed:\\s).+(?= Chapters)")
  # Number of chapters; treated as a single-chapter work when not matched.
  chapter_count <- first_match(meta_info, "(?<=Chapters:\\s).+(?=/)")

  if (is.na(chapter_count)) {
    # No chapter count means one chapter, and then there is no separate
    # completion date either.
    chapter_count <- 1L
    completion_date <- published_date
  } else {
    # Base 10 explicitly: base 0 would read a leading zero as octal.
    chapter_count <- strtoi(chapter_count, base = 10L)
  }

  # ---- text ----
  the_text <- work_page %>%
    html_nodes(css = "#chapters.userstuff .userstuff:not(blockquote)") %>%
    html_text2()
  if (length(the_text) == 0) {
    stop("No chapter text found at ", work_url, call. = FALSE)
  }

  # ---- corpus ----
  work_corpus <- corpus(the_text)
  # Prefix document names with the work identifier. Without this the names
  # would be duplicated across works and combining the corpora would fail.
  docnames(work_corpus) <- paste(work_id, docnames(work_corpus), sep = "-")

  # ---- summary data with metadata indicators ----
  work_summary <- summary(work_corpus, n = ndoc(work_corpus))
  work_summary$title <- title
  work_summary$author <- author
  work_summary$tagged_fandom <- tagged_fandom
  work_summary$tagged_relationships <- tagged_relationships
  work_summary$tagged_characters <- tagged_characters
  work_summary$published_date <- published_date
  work_summary$completion_date <- completion_date
  work_summary$chapter_count <- chapter_count

  # Attach the metadata to the corpus.
  docvars(work_corpus) <- work_summary

  # Chapter indicator, taken from the digits that follow a letter in the
  # document name.
  # NOTE(review): this is added after the docvars were attached, so `chapter`
  # exists only in summary_<id>, not in the corpus docvars. Kept as it was;
  # confirm whether that is intended.
  work_summary$chapter <- as.numeric(
    str_extract(work_summary$Text, "(?<=[:alpha:])[0-9]+")
  )

  # Publish the per-work objects under their dynamic names and keep the corpus
  # for the final combination.
  assign(paste0("fic_", work_id), work_corpus)
  assign(paste0("summary_", work_id), work_summary)
  work_corpora[[i]] <- work_corpus

  # Clear per-iteration temporaries.
  rm(
    work_url, work_id, work_page, meta_info, title, author, tagged_fandom,
    tagged_relationships, tagged_characters, published_date, completion_date,
    chapter_count, the_text, work_corpus, work_summary
  )
}

# Create the complete corpus from every per-work corpus, in CSV order.
complete_corpus <- do.call(c, work_corpora)

rm(i, work_index, work_corpora, first_match)
# Print a short guide to the objects this script leaves in the workspace.
# Each element is written to stdout followed by a newline.
writeLines(c(
  "creates following objects:",
  "-complete_corpus (the collated corpus of all works)",
  "-fic_[#] (the individual corpus for each work, numbered by popularity)",
  "-summary_[#] (the metadata for each fic)",
  "the extracted metadata variables for the corpus are:",
  "-title",
  "-author",
  "-tagged_fandom",
  "-tagged_relationships",
  "-tagged_characters",
  "-published_date",
  "-completion_date",
  "-chapter_count"
))