This code scrapes data from the Academy Awards database website: http://awardsdatabase.oscars.org/ampas_awards/BasicSearch

The information needed from this website is behind a search box.

We can obtain all the raw data from 1927 to 2014.

Browser needed: Firefox. Please install Firefox on your computer.

Packages needed: RSelenium, rvest, XML, RCurl

#Loading the packages

#install.packages("RSelenium")
#install.packages("rvest")

#install firefox browser

library(XML)
library(RCurl)
## Loading required package: bitops
library(RSelenium)
## Loading required package: RJSONIO
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:XML':
## 
##     xml
# Selenium start-up ----
# These calls must run in this order: the server has to exist and be running
# before a driver can connect to it.
checkForServer() # download Selenium Server if there is no server yet

# Start the Selenium Server.
startServer()  

# Create a new remote driver object. No arguments are passed, so RSelenium's
# defaults are used; the session output in this document reports a Firefox
# browser.
thebrowser <- remoteDriver()

# Open the connection to the server. The returned session capabilities are
# printed to the console.
thebrowser$open()
## [1] "Connecting to remote server"
## $applicationCacheEnabled
## [1] TRUE
## 
## $rotatable
## [1] FALSE
## 
## $handlesAlerts
## [1] TRUE
## 
## $databaseEnabled
## [1] TRUE
## 
## $version
## [1] "41.0.2"
## 
## $platform
## [1] "WINDOWS"
## 
## $nativeEvents
## [1] FALSE
## 
## $acceptSslCerts
## [1] TRUE
## 
## $webdriver.remote.sessionid
## [1] "22855fa1-e573-4fcf-acf2-433f1121dda8"
## 
## $webStorageEnabled
## [1] TRUE
## 
## $locationContextEnabled
## [1] TRUE
## 
## $browserName
## [1] "firefox"
## 
## $takesScreenshot
## [1] TRUE
## 
## $javascriptEnabled
## [1] TRUE
## 
## $cssSelectorsEnabled
## [1] TRUE
## 
## $id
## [1] "22855fa1-e573-4fcf-acf2-433f1121dda8"
# Open the search form of the awards database.
search_url <- "http://awardsdatabase.oscars.org/ampas_awards/BasicSearch"
thebrowser$navigate(search_url)

# Search criteria ----
# Each form field is located by its HTML `name` attribute and then filled in
# by sending key strokes to it.

# First award year to include: 1927.
box1 <- thebrowser$findElement(using = "name", value = "BSFromYear")
box1$sendKeysToElement(list("1927"))

# Last award year to include: 2014, so the search spans the first Oscars
# through the most recent one covered here.
box2 <- thebrowser$findElement(using = "name", value = "BSToYear")
box2$sendKeysToElement(list("2014"))

# Award category: all categories.
box3 <- thebrowser$findElement(using = "name", value = "BSCategory")
box3$sendKeysToElement(list("All"))

# Display type "1": results should be shown by category, chronologically.
box4 <- thebrowser$findElement(using = "name", value = "displayType")
box4$sendKeysToElement(list("1"))

# thebrowser$ExecuteScript(paste("scroll(100,1000);"))

# With the criteria filled in, find the Search button and click it.
search_button <- thebrowser$findElement(
  using = "xpath",
  value = "//input[@value = 'Search']"
)
search_button$clickElement()

# Results extraction ----
# Get the HTML source of the results page. getPageSource() returns a list;
# the source itself is its first element.
page_source <- thebrowser$getPageSource()

# The browser is no longer needed once the page source has been captured, so
# close the session instead of leaving the Firefox window open.
thebrowser$close()

# Parse the HTML, keep every <dl> node and extract its text content.
rawdata <- read_html(page_source[[1]]) %>%
  html_nodes("dl") %>%
  html_text()

# Quick sanity check of what was scraped.
str(rawdata)
##  chr "\n\n\n1927/28 (1st)\n\n\nACTOR\n\n\nRichard Barthelmess -- The Noose {\"Nickie Elkins\"}; and The Patent Leather Kid {\"The Pat"| __truncated__

# Save the raw text for later cleaning. write.table() defaults are kept
# on purpose so the output file format stays the same as before.
write.table(rawdata, file = "rawdata_from_awardsDatabase.txt")