This code scrapes data from the website http://awardsdatabase.oscars.org/ampas_awards/BasicSearch
The information we need from this website sits behind a search form.
Through that form we can obtain all the raw data from 1927 to 2014.
Browser needed: Firefox. Please install Firefox on your computer.
Packages needed: RSelenium, rvest, XML, RCurl
#Loading the packages
#install.packages("RSelenium")
#install.packages("rvest")
#install firefox browser
library(XML)
library(RCurl)
## Loading required package: bitops
library(RSelenium)
## Loading required package: RJSONIO
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:XML':
##
## xml
# Set up Selenium and open the browser session used by the rest of the script.
# NOTE(review): checkForServer() and startServer() belong to the older
# RSelenium API and appear to be defunct in later releases (superseded by
# rsDriver()) - confirm against the installed RSelenium version.
checkForServer() # download the Selenium Server, if it is not already present
# Start the Selenium Server
# NOTE(review): nothing here waits for the server to finish starting before
# the session is opened below - a short pause may be needed on slow machines.
startServer()
# Create a new remote driver object, using RSelenium's default settings
thebrowser <- remoteDriver()
# Open the connection to the server; this starts the browser session and
# prints the session capabilities
thebrowser$open()
## [1] "Connecting to remote server"
## $applicationCacheEnabled
## [1] TRUE
##
## $rotatable
## [1] FALSE
##
## $handlesAlerts
## [1] TRUE
##
## $databaseEnabled
## [1] TRUE
##
## $version
## [1] "41.0.2"
##
## $platform
## [1] "WINDOWS"
##
## $nativeEvents
## [1] FALSE
##
## $acceptSslCerts
## [1] TRUE
##
## $webdriver.remote.sessionid
## [1] "22855fa1-e573-4fcf-acf2-433f1121dda8"
##
## $webStorageEnabled
## [1] TRUE
##
## $locationContextEnabled
## [1] TRUE
##
## $browserName
## [1] "firefox"
##
## $takesScreenshot
## [1] TRUE
##
## $javascriptEnabled
## [1] TRUE
##
## $cssSelectorsEnabled
## [1] TRUE
##
## $id
## [1] "22855fa1-e573-4fcf-acf2-433f1121dda8"
# Run the search on the awards database and save the raw text of the results.
# Relies on `thebrowser`, an already opened RSelenium session.

# Load the search page
thebrowser$navigate("http://awardsdatabase.oscars.org/ampas_awards/BasicSearch")

# Let element lookups retry for up to 10 seconds instead of failing at once,
# so a slow page load does not break the findElement() calls below.
thebrowser$setImplicitWaitTimeout(milliseconds = 10000)

# Define our search criteria
# We are interested in the records starting from 1927 ...
box1 <- thebrowser$findElement(using = "name", "BSFromYear")
box1$sendKeysToElement(list("1927"))
# ... up to 2014, i.e. from the first Oscars to the last one available
box2 <- thebrowser$findElement(using = "name", "BSToYear")
box2$sendKeysToElement(list("2014"))
# We are interested in all the categories
box3 <- thebrowser$findElement(using = "name", "BSCategory")
box3$sendKeysToElement(list("All"))
# The results should be displayed by category and chronologically
# NOTE(review): "1" is presumably the key for that display option - confirm
# against the form's markup.
box4 <- thebrowser$findElement(using = "name", "displayType")
box4$sendKeysToElement(list("1"))
# thebrowser$ExecuteScript(paste("scroll(100,1000);"))

# After specifying our search criteria, we have to click on Search
thebrowser$findElement(using = "xpath", "//input[@value = 'Search']")$clickElement()

# Wait until the results are present before reading the page: the results are
# extracted from <dl> nodes below, so block (up to the implicit wait) until
# the first one exists. This raises an error instead of silently writing an
# empty file when no results show up.
# NOTE(review): assumes the search form page itself contains no <dl> element.
thebrowser$findElement(using = "css selector", "dl")

# Get the HTML source of the results page
page_source <- thebrowser$getPageSource()

# Parse the source and keep the text of every <dl> node
rawdata <- read_html(page_source[[1]]) %>%
  html_nodes("dl") %>%
  html_text()
str(rawdata)
## chr "\n\n\n1927/28 (1st)\n\n\nACTOR\n\n\nRichard Barthelmess -- The Noose {\"Nickie Elkins\"}; and The Patent Leather Kid {\"The Pat"| __truncated__

# Save the raw text for later processing
write.table(rawdata, file = "rawdata_from_awardsDatabase.txt")