The following is my attempt to extract Best Picture and Best Film Editing Oscar history from Wikipedia.

library(RCurl)
library(XML)
library(dplyr)
library(stringr)
library(tidyr)

# Best Picture ----
# Download the Best Picture article and parse every HTML table it contains.
bp_url <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture#Winners_and_nominees"
bp_doc <- getURL(bp_url)

# Keep only parsed tables 9 through 89. The positions are hard-coded;
# presumably these are the per-year nominee tables -- verify against the
# current page layout before re-running.
bp_tables <- readHTMLTable(bp_doc)[9:89]

# Label one scraped Best Picture table with its award year and winner flag.
#
# x: a three-column table (film, production company, producers), either a
#    data frame or a one-element list holding one; the first row is treated
#    as the winner.
# y: the year to record for every row of the table.
#
# Returns a data frame with columns Film, Production_Company, Producers,
# Year and Best_Pic_Oscar ("t" for the first row, "f" for all others).
# Stops with an error when the table does not have exactly three columns.
add_year <- function(x, y) {
  x <- as.data.frame(x, stringsAsFactors = FALSE)
  if (ncol(x) != 3L) {
    stop(
      "expected a 3-column table (film, production company, producers), got ",
      ncol(x), " column(s)",
      call. = FALSE
    )
  }
  colnames(x) <- c("Film", "Production_Company", "Producers")

  n <- nrow(x)
  x$Year <- rep(y, n)
  # Explicit counts keep this valid for a zero-row table, where
  # rep("f", n - 1) would be called with a negative count and fail.
  x$Best_Pic_Oscar <- rep(c("t", "f"), times = c(min(n, 1L), max(n - 1L, 0L)))
  x
}

# Combine all yearly tables into one data frame ----
# The tables are numbered consecutively by year starting at 1934; this
# assumes bp_tables holds exactly one table per year in order (TODO confirm
# against the current page layout).
# `1934 + ...` keeps Year a double, as it was when incremented in a loop.
bp_years <- 1934 + seq_along(bp_tables) - 1

# Tag every table with its year and winner flag, then bind once at the end
# instead of growing the data frame with rbind() inside a loop. No all-NA
# placeholder row is needed as a seed, so none is left behind in the result.
bp_parts <- Map(add_year, unname(bp_tables), bp_years)

if (length(bp_parts) > 0) {
  bp_data_frame <- do.call(rbind, bp_parts)
} else {
  bp_data_frame <- data.frame(
    Film = character(0),
    Production_Company = character(0),
    Producers = character(0),
    Year = numeric(0),
    Best_Pic_Oscar = character(0),
    stringsAsFactors = FALSE
  )
}

# The HTML parser may hand back factor columns depending on package/R
# version; store plain character so later joins compare strings.
bp_factor_cols <- vapply(bp_data_frame, is.factor, logical(1))
if (any(bp_factor_cols)) {
  bp_data_frame[bp_factor_cols] <-
    lapply(bp_data_frame[bp_factor_cols], as.character)
}
rownames(bp_data_frame) <- NULL

# Best Film Editing ----
# This page was much harder to scrape:
#   * every year except 1950-1989 is laid out in tables;
#   * 1950-1989 is laid out as a list instead;
#   * the tables after the 1950-89 list swap the order of nominee and film.
edit_url <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Film_Editing"
edit_doc <- getURL(edit_url)
p_edit_doc <- htmlParse(edit_doc)

# Collect the text of every cell (any child element of a <tr>).
# The earlier predicate
#   contains(text(), ', ') or contains(text(), ' and ')
#   or not(contains(text(), ' and ')) or not(contains(text(), ', '))
# has the form "A or B or not(B) or not(A)", which is true for every node,
# so it never filtered anything. The bare path selects exactly the same
# cells: single editors, multiple editors, films and header junk alike.
list_nominees <- xpathSApply(p_edit_doc, "//tr/*", fun = xmlValue)

# Entries 33 to 495 of the scraped cells: films, editors and some header
# junk, all mixed together in one vector.
nominees <- list_nominees[33:495]

# Walk the entries in order with a two-flag state machine.
#   * An entry starting with four digits sets the current year and marks the
#     next pair of entries as that year's winner.
#   * All other (non-junk) entries are consumed in pairs. The first entry of
#     each pair records the current year and a winner flag ("t" for the
#     first pair after a year entry, "f" for every later pair); both entries
#     of the pair are stored in film_editors.
# Each result vector starts with a placeholder element (NA or " ").
year <- NA
film_editors <- NA
oscar_edit <- c(" ")
is_movie <- FALSE   # TRUE when the next entry is the first of a pair
is_winner <- FALSE  # TRUE while the winner's pair is being consumed
for (idx in seq_along(nominees)) {
  entry <- nominees[idx]

  # Skip table-header cells.
  if (grepl("Editor", entry) || entry == "Year" || entry == "Film") {
    next
  }

  if (grepl("^\\d{4}", entry)) {
    # Set the year; the pair that follows belongs to the winner.
    yr <- as.numeric(str_extract(entry, "^\\d{4}"))
    is_movie <- TRUE
    is_winner <- TRUE
    next
  }

  # Not a year: a film or an editor.
  if (is_movie) {
    year <- c(year, yr)
    oscar_edit <- c(oscar_edit, if (is_winner) "t" else "f")
  }
  film_editors <- c(film_editors, entry)
  if (!is_movie) {
    # Second entry of a pair: the winner's pair (if any) is now complete.
    is_winner <- FALSE
  }
  is_movie <- !is_movie
}

# The tables switch the order of film and editor after the 1950s-80s list.
# film_editors[1] is a placeholder. Up to position 161 the even positions
# are taken as editors and the odd positions as films; from position 162
# onward the even positions are taken as films and the odd ones as editors.
# The split points are hard-coded for the page as it was scraped.
Film <- c(film_editors[seq(3, 161, 2)], film_editors[seq(162, 410, 2)])
Editors <- c(
  film_editors[seq(2, 160, 2)],
  film_editors[seq(163, length(film_editors), 2)]
)

# Fail loudly if the hard-coded positions no longer line up, rather than
# letting data.frame() recycle a shorter vector.
stopifnot(
  length(Film) == length(Editors),
  length(Film) == length(year) - 1,
  length(Film) == length(oscar_edit) - 1
)

# year[-1] / oscar_edit[-1] drop the placeholder first elements.
edit_data_frame <- data.frame(
  Year = year[-1],
  Film = Film,
  Editor = Editors,
  Edit_Oscar = oscar_edit[-1],
  stringsAsFactors = FALSE
)

# Join the best picture and best editing data frames. The keys are the two
# columns the frames share; naming them makes the join explicit and keeps it
# stable if either frame gains another column.
best_pic_best_edit_df <- left_join(
  bp_data_frame,
  edit_data_frame,
  by = c("Film", "Year")
)
# Only open the viewer when someone is there to look at it.
if (interactive()) {
  View(best_pic_best_edit_df)
}


# 1950-89 nominees: these were laid out in lists rather than tables ----
editing_50_89 <- xpathSApply(p_edit_doc, "//ul[position()>1]/li", fun = xmlValue)
# Drop entries 41-51 (hard-coded positions; presumably list items that are
# not award years -- verify against the current page).
editing_50_89 <- editing_50_89[-c(41:51)]

# Each remaining entry is one long string per year: the four-digit year,
# then one "Film<em dash>Editor" pair per line, winner first. For example:
#   "1985 Witness<em dash>Thom Noble\nA Chorus Line<em dash>John Bloom\n..."
# where <em dash> is U+2014.

# Split one year's string into a data frame with columns Year, Film, Editor
# and Edit_Oscar ("t" for the first line, "f" for the rest).
parse_editing_year <- function(entry) {
  yr <- as.numeric(gsub(pattern = "(\\d{4})(.*)", replacement = "\\1", entry))
  listing <- gsub(pattern = "(\\d{4})(.*)", replacement = "\\2", entry)
  listing <- sub("\\s+$", "", listing)
  listing <- sub("^\\s+", "", listing)
  nominee_lines <- unlist(str_split(listing, "\n"))

  parsed <- data.frame(
    Year = rep(yr, length(nominee_lines)),
    Nominee = nominee_lines,
    Edit_Oscar = c("t", rep("f", length(nominee_lines) - 1)),
    stringsAsFactors = FALSE
  )
  separate(parsed, Nominee, into = c("Film", "Editor"), sep = "\U2014")
}

# Parse every year, then bind once. Unlike seeding the result with an all-NA
# row and rbind()-ing inside a loop, this leaves no placeholder row in
# df_5089 and also copes with an empty input.
yearly_5089 <- lapply(unname(editing_50_89), parse_editing_year)
if (length(yearly_5089) > 0) {
  df_5089 <- do.call(rbind, yearly_5089)
} else {
  df_5089 <- data.frame(
    Year = numeric(0),
    Film = character(0),
    Editor = character(0),
    Edit_Oscar = character(0),
    stringsAsFactors = FALSE
  )
}


# Join Best Picture nominees to their editing nominations ----
# Years outside 1950-1989 are matched against the table-derived editing
# data; 1950-1989 are matched against the list-derived data. Both joins use
# the two columns the frames share, named explicitly so the keys do not
# silently change if a frame gains another common column.
join_keys <- c("Film", "Year")
best_pic_best_edit_df_1 <- left_join(
  subset(bp_data_frame, Year > 1989 | Year < 1950),
  edit_data_frame,
  by = join_keys
)
best_pic_best_edit_df_2 <- left_join(
  subset(bp_data_frame, Year > 1949 & Year < 1990),
  df_5089,
  by = join_keys
)

# Stack both eras, newest year first, and save the result.
best_pic_best_edit_df <- rbind(best_pic_best_edit_df_1, best_pic_best_edit_df_2)
best_pic_best_edit_df <- arrange(best_pic_best_edit_df, desc(Year))
write.csv(best_pic_best_edit_df, "best_pic_editor.csv")
head(best_pic_best_edit_df)
##                                              Film
## 1 Birdman or (The Unexpected Virtue of Ignorance)
## 2                                 American Sniper
## 3                                         Boyhood
## 4                        The Grand Budapest Hotel
## 5                              The Imitation Game
## 6                                           Selma
##                                                                                    Production_Company
## 1                                       Fox Searchlight, Regency Enterprises, Worldview Entertainment
## 2 Warner Bros., Village Roadshow Pictures, Mad Chance Productions, 22nd and Indiana Pictures, Malpaso
## 3                                                                                           IFC Films
## 4                  Fox Searchlight, American Empirical Pictures, Indian Paintbrush, Babelsberg Studio
## 5                             The Weinstein Co., Black Bear Pictures, Bristol Automotive, StudioCanal
## 6                              Paramount, Pathé, Cloud Eight Films, Plan B Entertainment, Harpo Films
##                                                                       Producers
## 1                  Alejandro G. Iñárritu, John Lesher and James W. Skotchdopole
## 2 Clint Eastwood, Andrew Lazar, Robert Lorenz, Bradley Cooper, and Peter Morgan
## 3                                     Richard Linklater and Cathleen Sutherland
## 4                    Wes Anderson, Scott Rudin, Steven Rales, and Jeremy Dawson
## 5                            Nora Grossman, Ido Ostrowsky, and Teddy Schwarzman
## 6             Christian Colson, Oprah Winfrey, Dede Gardner, and Jeremy Kleiner
##   Year Best_Pic_Oscar                     Editor Edit_Oscar
## 1 2014              t                       <NA>       <NA>
## 2 2014              f Joel Cox and Gary D. Roach          f
## 3 2014              f               Sandra Adair          f
## 4 2014              f             Barney Pilling          f
## 5 2014              f         William Goldenberg          f
## 6 2014              f                       <NA>       <NA>