This page features a step-by-step guide to web-scraping data from an HTML webpage. Data compiled from https://www.realestate.com.au/auction-results/vic on 22nd July 2018.
library(rvest)
## Loading required package: xml2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Address of the auction-results page to scrape
url <- "https://www.realestate.com.au/auction-results/vic"
# Fetch the page and parse its HTML into a document object
webpage <- read_html(url)
#What do I want?
#Address
#Suburb
#Match the address with suburb
#Price
#Bedroom
#Property Type
#Auction Result
#Date
#Agent
#URL
# Scrape the address cells and the suburb headings together using html_nodes,
# so that each address follows the suburb heading it belongs to.
Address <- html_nodes(webpage, '.col-address, .col-suburb-name')
class(Address)
## [1] "xml_nodeset"
# Convert the scraped nodes from an xml_nodeset to text using html_text
Address <- html_text(Address)
# Check the type after conversion
class(Address)
## [1] "character"
# Convert to a single-column data frame. Column name -> Address
address_df <- data.frame(Address)
# Scrape the suburb headings on their own using html_nodes
Suburb <- html_nodes(webpage, '.col-suburb-name')
# Convert the suburb nodes to text
Suburb <- html_text(Suburb)
# Convert to a single-column data frame. Column name -> Suburb
suburb_df <- data.frame(Suburb)
# Match the address with the suburb:
# match() finds, for every value in address_df$Address, its position in
# suburb_df$Suburb. Rows whose text is a suburb heading receive that suburb
# name; genuine address rows receive NA.
address_df$Suburb <- suburb_df$Suburb[match(address_df$Address, suburb_df$Suburb)]
# Fill each NA with the most recent non-NA value before it, i.e. the suburb
# heading the address sits under (see help("na.locf", package = "zoo")).
# na.rm = FALSE keeps the vector the same length as the data frame; the
# default would drop leading NAs and the shorter result could be recycled
# or rejected on assignment.
address_df$Suburb <- na.locf(address_df$Suburb, na.rm = FALSE)
# Stop with a clear message if any address appears before the first heading
if (anyNA(address_df$Suburb)) {
  stop("Found address rows that are not preceded by a suburb heading.",
       call. = FALSE)
}
# Remove the heading rows (rows where Address and Suburb are identical)
address_df <- subset(address_df, as.character(Address) != as.character(Suburb))
head(address_df); nrow(address_df)
## Address Suburb
## 2 10A Watt St Airport West
## 4 74 Evergreen Av Albanvale
## 6 66 Barrett St Albert Park
## 7 18 Page St Albert Park
## 9 2/8 Mitchell Av Altona North
## 10 16 Allan St Altona North
## [1] 550
# Extract the text of every price cell in one pipeline
Price <- webpage %>%
  html_nodes(' .col-property-price') %>%
  html_text()
# Wrap the text in a data frame and drop the cells that only hold the
# column label "Price"
price_df <- data.frame(Price) %>%
  filter(Price != "Price")
# Preview the result and confirm the row count agrees with the other columns
head(price_df)
nrow(price_df)
## Price
## 1 $925,000
## 2 -
## 3 $2,305,000
## 4 $2,740,000
## 5 $685,000
## 6 -
## [1] 550
# Extract the text of every property-type cell
Property_Type <- html_text(html_nodes(webpage, '.col-property-type'))
# Build a one-column data frame from the scraped text
property_type_df <- data.frame(Property_Type)
# Keep only real values; cells holding the column label "Type" are dropped
property_type_df <- property_type_df %>%
  filter(Property_Type != "Type")
# Preview the result and confirm the row count agrees with the other columns
head(property_type_df)
nrow(property_type_df)
## Property_Type
## 1 House
## 2 House
## 3 House
## 4 House
## 5 Unit
## 6 House
## [1] 550
# Scrape the bedroom-count cells using html_nodes
Beds <- html_nodes(webpage, '.col-num-beds')
# Convert the bedroom nodes to text
Beds <- html_text(Beds)
# Convert to a single-column data frame. Column name -> Beds
beds_df <- data.frame(Beds)
# Drop the cells that only hold the column label "Beds".
# The predicate refers to the Beds column (not the whole data frame), so
# filter() receives a plain logical vector, as in the other sections.
beds_df <- filter(beds_df, Beds != "Beds")
# nrow to make sure the number of rows matches the other columns
head(beds_df); nrow(beds_df)
## Beds
## 1 4
## 2 3
## 3 4
## 4 3
## 5 3
## 6 3
## [1] 550
# Extract the text of every auction-result cell in one pipeline
Auction_Result <- webpage %>%
  html_nodes('.col-auction-result') %>%
  html_text()
# Build the data frame, then discard cells holding the column label "Result"
auction_result_df <- data.frame(Auction_Result)
auction_result_df <- auction_result_df %>%
  filter(Auction_Result != "Result")
# Preview the result and confirm the row count agrees with the other columns
head(auction_result_df)
nrow(auction_result_df)
## Auction_Result
## 1 Passed In - Vendor Bid
## 2 Passed In
## 3 Sold At Auction
## 4 Sold At Auction
## 5 Sold At Auction
## 6 Passed In
## [1] 550
# Extract the text of every auction-date cell in one pipeline
Date <- webpage %>%
  html_nodes('.col-auction-date') %>%
  html_text()
# Wrap the text in a data frame and drop the cells that only hold the
# column label "Date"
Date_df <- data.frame(Date) %>%
  filter(Date != "Date")
# Preview the result and confirm the row count agrees with the other columns
head(Date_df)
nrow(Date_df)
## Date
## 1 21/07/18
## 2 21/07/18
## 3 21/07/18
## 4 21/07/18
## 5 21/07/18
## 6 21/07/18
## [1] 550
# Scrape the agent cells using html_nodes.
# The class selector '.col-agent' matches every element carrying that class,
# including ones that also carry other classes. The former extra selector
# '.col-agent ellipsis' was a descendant selector for an <ellipsis> tag
# (presumably '.col-agent.ellipsis' was intended), so it is dropped.
Agent <- html_nodes(webpage, '.col-agent')
# Convert the agent nodes to text
Agent <- html_text(Agent)
# Convert to a single-column data frame. Column name -> Agent
agent_df <- data.frame(Agent)
# Drop the cells that only hold the column label "Agency"
agent_df <- filter(agent_df, Agent != "Agency")
# nrow to make sure the number of rows matches the other columns.
# Note the agent names still carry a trailing newline at this point.
head(agent_df); nrow(agent_df)
## Agent
## 1 Barry Plant Essendon\n
## 2 Sahara Real Estate - Truganina\n
## 3 Greg Hocking Holdsworth\n
## 4 Greg Hocking Holdsworth\n
## 5 Barry Plant - Newport\n
## 6 Greg Hocking Elly Partners\n
## [1] 550
# Auction dates (this repeats the date scrape performed earlier in the file
# and yields the same Date and Date_df objects)
Date <- html_text(html_nodes(webpage, '.col-auction-date'))
# Build the data frame, then discard cells holding the column label "Date"
Date_df <- data.frame(Date)
Date_df <- Date_df %>%
  filter(Date != "Date")
# Preview the result and confirm the row count agrees with the other columns
head(Date_df)
nrow(Date_df)
## Date
## 1 21/07/18
## 2 21/07/18
## 3 21/07/18
## 4 21/07/18
## 5 21/07/18
## 6 21/07/18
## [1] 550
# Scrape the agent cells using html_nodes.
# The class selector '.col-agent' matches every element carrying that class,
# including ones that also carry other classes. The former extra selector
# '.col-agent ellipsis' was a descendant selector for an <ellipsis> tag
# (presumably '.col-agent.ellipsis' was intended), so it is dropped.
Agent <- html_nodes(webpage, '.col-agent')
# Convert the agent nodes to text
Agent <- html_text(Agent)
head(Agent)
## [1] "Agency"
## [2] "Barry Plant Essendon\n "
## [3] "Agency"
## [4] "Sahara Real Estate - Truganina\n "
## [5] "Agency"
## [6] "Greg Hocking Holdsworth\n "
# Data cleaning: strip everything from the first newline onwards, which
# removes the trailing newline and padding after each agency name.
Agent <- gsub("\n.*", "", Agent)
head(Agent)
## [1] "Agency" "Barry Plant Essendon"
## [3] "Agency" "Sahara Real Estate - Truganina"
## [5] "Agency" "Greg Hocking Holdsworth"
# Convert to a single-column data frame. Column name -> Agent
agent_df <- data.frame(Agent)
# Drop the cells that only hold the column label "Agency"
agent_df <- filter(agent_df, Agent != "Agency")
# nrow to make sure the number of rows matches the other columns
head(agent_df); nrow(agent_df)
## Agent
## 1 Barry Plant Essendon
## 2 Sahara Real Estate - Truganina
## 3 Greg Hocking Holdsworth
## 4 Greg Hocking Holdsworth
## 5 Barry Plant - Newport
## 6 Greg Hocking Elly Partners
## [1] 550
# Scrape the address nodes again; the listing link lives on these elements
URL_node <- html_nodes(webpage, '.col-address')
URL_node
## {xml_nodeset (550)}
## [1] <a href="/128722914" class="col-address">10A Watt St</a>
## [2] <a href="/128597974" class="col-address">74 Evergreen Av</a>
## [3] <a href="/128703422" class="col-address">66 Barrett St</a>
## [4] <a href="/128683478" class="col-address">18 Page St</a>
## [5] <a href="/128678506" class="col-address">2/8 Mitchell Av</a>
## [6] <a href="/127400270" class="col-address">16 Allan St</a>
## [7] <a href="/128643366" class="col-address">1/102 Maxweld St</a>
## [8] <a href="/128703046" class="col-address">10/71 Denbigh Rd</a>
## [9] <a href="/128711730" class="col-address">17A Ferguson St</a>
## [10] <a href="/128626506" class="col-address">9 Geddes St</a>
## [11] <a href="/128659922" class="col-address">1/6 Crete Av</a>
## [12] <div class="col-address">45 Fakenham Rd</div>
## [13] <a href="/128682954" class="col-address">1/211 Huntingdale Rd</a>
## [14] <a href="/128687990" class="col-address">7B Lawrence Av</a>
## [15] <a href="/128698282" class="col-address">17/5 Mcintosh Ct</a>
## [16] <a href="/128678562" class="col-address">16 The Sands</a>
## [17] <a href="/128598346" class="col-address">9 Sovereign Way</a>
## [18] <a href="/128659678" class="col-address">10 Doyle St</a>
## [19] <a href="/128702086" class="col-address">12/84 Westbury St</a>
## [20] <a href="/128701022" class="col-address">4/60 Sycamore Gv</a>
## ...
# The link is stored in the "href" attribute. Nodes without one (such as the
# <div> at position 12 above) come back as NA.
URL <- html_attr(URL_node, "href")
# Prefix the site address to form the whole URL, but only where a link
# exists; missing links stay NA instead of becoming ".com.auNA" strings.
has_link <- !is.na(URL)
URL[has_link] <- paste0("https://www.realestate.com.au", URL[has_link])
head(URL)
## [1] "https://www.realestate.com.au/128722914"
## [2] "https://www.realestate.com.au/128597974"
## [3] "https://www.realestate.com.au/128703422"
## [4] "https://www.realestate.com.au/128683478"
## [5] "https://www.realestate.com.au/128678506"
## [6] "https://www.realestate.com.au/127400270"
# Convert to a single-column data frame. Column name -> URL
URL_df <- data.frame(URL)
# nrow to make sure the number of rows matches the other columns
head(URL_df); nrow(URL_df)
## URL
## 1 https://www.realestate.com.au/128722914
## 2 https://www.realestate.com.au/128597974
## 3 https://www.realestate.com.au/128703422
## 4 https://www.realestate.com.au/128683478
## 5 https://www.realestate.com.au/128678506
## 6 https://www.realestate.com.au/127400270
## [1] 550
# Merge all the per-column data frames side by side into one table.
# The pieces are combined by position, so they must all have the same number
# of rows. Check this explicitly: data.frame() can silently recycle a shorter
# piece whose row count divides the longest one.
row_counts <- c(
  nrow(address_df), nrow(price_df), nrow(beds_df), nrow(property_type_df),
  nrow(auction_result_df), nrow(Date_df), nrow(agent_df), nrow(URL_df)
)
if (length(unique(row_counts)) != 1L) {
  stop("Scraped columns have differing row counts: ",
       paste(row_counts, collapse = ", "), call. = FALSE)
}
AUCTION <- data.frame(address_df, price_df, beds_df, property_type_df, auction_result_df, Date_df, agent_df, URL_df)
nrow(AUCTION) # 550
## [1] 550
head(AUCTION)
## Address Suburb Price Beds Property_Type
## 2 10A Watt St Airport West $925,000 4 House
## 4 74 Evergreen Av Albanvale - 3 House
## 6 66 Barrett St Albert Park $2,305,000 4 House
## 7 18 Page St Albert Park $2,740,000 3 House
## 9 2/8 Mitchell Av Altona North $685,000 3 Unit
## 10 16 Allan St Altona North - 3 House
## Auction_Result Date Agent
## 2 Passed In - Vendor Bid 21/07/18 Barry Plant Essendon
## 4 Passed In 21/07/18 Sahara Real Estate - Truganina
## 6 Sold At Auction 21/07/18 Greg Hocking Holdsworth
## 7 Sold At Auction 21/07/18 Greg Hocking Holdsworth
## 9 Sold At Auction 21/07/18 Barry Plant - Newport
## 10 Passed In 21/07/18 Greg Hocking Elly Partners
## URL
## 2 https://www.realestate.com.au/128722914
## 4 https://www.realestate.com.au/128597974
## 6 https://www.realestate.com.au/128703422
## 7 https://www.realestate.com.au/128683478
## 9 https://www.realestate.com.au/128678506
## 10 https://www.realestate.com.au/127400270