The following is my attempt to extract Best Picture and Best Film Editing Oscar history from Wikipedia.
library(RCurl)
library(XML)
library(dplyr)
library(stringr)
library(tidyr)
# ---- Best Picture ----
# Download the Best Picture article and read every HTML table on the page.
bp_url <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture#Winners_and_nominees"
bp_doc <- getURL(bp_url)
# Only list positions 9 through 89 are used by the rest of the script.
wanted_tables <- seq(from = 9, to = 89)
bp_tables <- readHTMLTable(bp_doc)[wanted_tables]
#' Tag one year's Best Picture table with its year and a winner flag.
#'
#' @param x A three-column table (or anything `as.data.frame()` turns into
#'   one). The columns are renamed, in order, to `Film`,
#'   `Production_Company` and `Producers`. The first row is flagged as the
#'   winner.
#' @param y The year to record on every row.
#' @return `x` as a data frame with the extra columns `Year` and
#'   `Best_Pic_Oscar` ("t" for the first row, "f" for every other row).
#'   A zero-row input yields a zero-row result.
add_year <- function(x, y) {
  x <- as.data.frame(x, stringsAsFactors = FALSE)
  colnames(x) <- c("Film", "Production_Company", "Producers")
  n_rows <- nrow(x)
  x$Year <- rep(y, n_rows)
  # Start from all "f" and overwrite the first slot, so an empty table never
  # reaches rep("f", -1), which is an error.
  winner_flag <- rep("f", n_rows)
  winner_flag[seq_len(min(n_rows, 1))] <- "t"
  x$Best_Pic_Oscar <- winner_flag
  x
}
# Combine all per-year tables into one data frame.
# The i-th table is labelled with year 1934 + (i - 1), i.e. the tables are
# taken to be one per year in ascending order starting at 1934.
bp_first_year <- 1934
bp_yearly <- lapply(seq_along(bp_tables), function(i) {
  add_year(bp_tables[[i]], bp_first_year + i - 1)
})
if (length(bp_yearly) > 0) {
  # A single rbind over the whole list: no all-NA seed row is left at the top
  # of the result, and the frame is not re-copied on every iteration.
  bp_data_frame <- do.call(rbind, bp_yearly)
} else {
  # Nothing scraped: keep the expected columns so later joins still work.
  bp_data_frame <- data.frame(
    Film = character(0),
    Production_Company = character(0),
    Producers = character(0),
    Year = numeric(0),
    Best_Pic_Oscar = character(0),
    stringsAsFactors = FALSE
  )
}
# ---- Best Film Editing ----
# This page is harder to scrape than the Best Picture one:
#   * nominees from 1950 to 1989 are in a list, everything else is in tables;
#   * the tables after that list swap the order of nominee and movie.
edit_url <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Film_Editing"
edit_doc <- getURL(edit_url)
p_edit_doc <- htmlParse(edit_doc)
# XPath for the table cells. It was written to catch films with several
# editors (", " / " and ") as well as single nominees.
# NOTE(review): the predicate has the form A or B or not(B) or not(A), so it
# holds for every child element of every <tr>.
nominee_cell_xpath <- "//tr/*[contains(text(), ', ')
or contains(text(), ' and ')
or not(contains(text(), ' and '))
or not(contains(text(), ', '))]"
list_nominees <- xpathSApply(p_edit_doc, nominee_cell_xpath, fun = xmlValue)
# The result holds movies, editors and some unrelated cells in one flat
# sequence; only positions 33 through 495 are used further down.
nominee_positions <- seq(from = 33, to = 495)
nominees <- list_nominees[nominee_positions]
# Walk the flat `nominees` vector and rebuild, for every nomination, its year,
# a winner flag, and the film/editor text.
#
# How entries are interpreted:
#   * header cells ("Year", "Film", anything containing "Editor") are skipped;
#   * an entry starting with four digits opens a new year;
#   * every other entry is one half of a two-entry nomination (film and
#     editor, in whichever order the page lists them).
# The first nomination after a year entry is flagged "t", later ones "f".
#
# Results (each keeps a placeholder in position 1, which the code further
# down relies on when it uses year[-1], oscar_edit[-1] and reads
# film_editors from position 2):
#   year         - one value per nomination
#   oscar_edit   - "t" / "f" per nomination
#   film_editors - every non-header, non-year entry, in page order
if (anyNA(nominees)) {
  # Otherwise the comparisons below fail with an opaque `if (NA)` error.
  stop("`nominees` contains missing entries; check the scraped range.",
       call. = FALSE)
}

# Preallocate to the largest possible size and trim afterwards, instead of
# growing three vectors with append() on every iteration.
max_entries <- length(nominees)
year <- rep(NA_real_, max_entries + 1)
oscar_edit <- c(" ", rep(NA_character_, max_entries))
film_editors <- rep(NA_character_, max_entries + 1)
n_nominations <- 0
n_entries <- 0

is_movie <- FALSE   # TRUE when the next entry opens a new nomination
is_winner <- FALSE  # TRUE until the year's first nomination is complete

for (i in seq_along(nominees)) {
  entry <- nominees[i]
  if (grepl("Editor", entry) || entry == "Year" || entry == "Film") {
    next # header cell, not data
  }
  if (grepl("^\\d{4}", entry)) {
    # set the year; the nomination that follows is the winner
    yr <- as.numeric(str_extract(entry, "^\\d{4}"))
    is_movie <- TRUE
    is_winner <- TRUE
    next
  }
  # not a year, so one half of a film/editor pair
  n_entries <- n_entries + 1
  film_editors[n_entries + 1] <- entry
  if (is_movie) {
    # first half of a pair: record the nomination itself
    n_nominations <- n_nominations + 1
    year[n_nominations + 1] <- yr
    oscar_edit[n_nominations + 1] <- if (is_winner) "t" else "f"
    is_movie <- FALSE
  } else {
    # second half of a pair: whatever comes next is no longer the winner
    is_movie <- TRUE
    is_winner <- FALSE
  }
}

year <- year[seq_len(n_nominations + 1)]
oscar_edit <- oscar_edit[seq_len(n_nominations + 1)]
film_editors <- film_editors[seq_len(n_entries + 1)]
# Split film_editors back into films and editors.
# Position 1 is a placeholder. From position 2 up to 161 the entries
# alternate editor, film; from position 162 on they alternate film, editor,
# because the tables switch the order of movie/editor after the 50s-80s list.
# These cut-off positions are tied to the page layout at scrape time.
Film <- c(
  film_editors[seq(3, 161, by = 2)],
  film_editors[seq(162, 410, by = 2)]
)
Editors <- c(
  film_editors[seq(2, 160, by = 2)],
  film_editors[seq(163, length(film_editors), by = 2)]
)
edit_data_frame <- data.frame(
  Year = year[-1],
  Film = Film,
  Editor = Editors,
  Edit_Oscar = oscar_edit[-1],
  stringsAsFactors = FALSE
)
# Join the best picture and best editing data frames. The keys are spelled
# out (they are the two columns the frames share) so the join cannot silently
# change if either frame gains another common column.
best_pic_best_edit_df <- left_join(
  bp_data_frame,
  edit_data_frame,
  by = c("Film", "Year")
)
# The data viewer is only useful, and only reliably available, interactively.
if (interactive()) {
  View(best_pic_best_edit_df)
}
# Nominees for 1950-1989 are published as lists rather than tables.
editing_50_89 <- xpathSApply(p_edit_doc, "//ul[position()>1]/li", fun = xmlValue)
# Items 41 to 51 are discarded (presumably list items that are not per-year
# nominee lists -- confirm against the page).
editing_50_89 <- editing_50_89[-c(41:51)]
# Each remaining item is one long string per year containing all movies and
# editors, winner first, separated from each other by newlines and from
# their editors by an em dash (U+2014), e.g.
#   "1985 Witness—Thom Noble\nA Chorus Line—John Bloom\nOut of Africa—Fredric Steinkamp,..."

# Turn one such list item into a data frame with one row per nomination
# (columns Year, Film, Editor, Edit_Oscar; the first row is flagged "t").
parse_editing_year <- function(item) {
  yr <- as.numeric(gsub(pattern = "(\\d{4})(.*)", replacement = "\\1", item))
  rest <- gsub(pattern = "(\\d{4})(.*)", replacement = "\\2", item)
  rest <- sub("\\s+$", "", rest)
  rest <- sub("^\\s+", "", rest)
  entries <- unlist(str_split(rest, "\n"))
  year_df <- data.frame(
    Year = rep(yr, length(entries)),
    nominees5089 = entries,
    Edit_Oscar = c("t", rep("f", length(entries) - 1)),
    stringsAsFactors = FALSE
  )
  year_df %>%
    separate(nominees5089, into = c("Film", "Editor"), sep = "\U2014")
}

yearly_5089 <- lapply(editing_50_89, parse_editing_year)
if (length(yearly_5089) > 0) {
  # One rbind over the whole list: no all-NA seed row, no per-iteration copy.
  df_5089 <- do.call(rbind, yearly_5089)
} else {
  # Nothing scraped: keep the expected columns so the later join still works.
  df_5089 <- data.frame(
    Year = numeric(0),
    Film = character(0),
    Editor = character(0),
    Edit_Oscar = character(0),
    stringsAsFactors = FALSE
  )
}
# Attach the editing data to the Best Picture rows. Years 1950-1989 take
# their editing data from the list-derived frame, every other year from the
# table-derived frame.
bp_table_years <- subset(bp_data_frame, Year < 1950 | Year > 1989)
bp_list_years <- subset(bp_data_frame, Year < 1990 & Year > 1949)
best_pic_best_edit_df_1 <- bp_table_years %>% left_join(edit_data_frame)
best_pic_best_edit_df_2 <- bp_list_years %>% left_join(df_5089)
# Stack both halves, newest year first.
best_pic_best_edit_df <- best_pic_best_edit_df_1 %>%
  rbind(best_pic_best_edit_df_2) %>%
  arrange(desc(Year))
write.csv(best_pic_best_edit_df, file = "best_pic_editor.csv")
head(best_pic_best_edit_df)
## Film
## 1 Birdman or (The Unexpected Virtue of Ignorance)
## 2 American Sniper
## 3 Boyhood
## 4 The Grand Budapest Hotel
## 5 The Imitation Game
## 6 Selma
## Production_Company
## 1 Fox Searchlight, Regency Enterprises, Worldview Entertainment
## 2 Warner Bros., Village Roadshow Pictures, Mad Chance Productions, 22nd and Indiana Pictures, Malpaso
## 3 IFC Films
## 4 Fox Searchlight, American Empirical Pictures, Indian Paintbrush, Babelsberg Studio
## 5 The Weinstein Co., Black Bear Pictures, Bristol Automotive, StudioCanal
## 6 Paramount, Pathé, Cloud Eight Films, Plan B Entertainment, Harpo Films
## Producers
## 1 Alejandro G. Iñárritu, John Lesher and James W. Skotchdopole
## 2 Clint Eastwood, Andrew Lazar, Robert Lorenz, Bradley Cooper, and Peter Morgan
## 3 Richard Linklater and Cathleen Sutherland
## 4 Wes Anderson, Scott Rudin, Steven Rales, and Jeremy Dawson
## 5 Nora Grossman, Ido Ostrowsky, and Teddy Schwarzman
## 6 Christian Colson, Oprah Winfrey, Dede Gardner, and Jeremy Kleiner
## Year Best_Pic_Oscar Editor Edit_Oscar
## 1 2014 t <NA> <NA>
## 2 2014 f Joel Cox and Gary D. Roach f
## 3 2014 f Sandra Adair f
## 4 2014 f Barney Pilling f
## 5 2014 f William Goldenberg f
## 6 2014 f <NA> <NA>