Webscraping Real Estate Data

This page features a step-by-step guide to web-scraping data from an HTML webpage. The data was compiled from https://www.realestate.com.au/auction-results/vic on 22 July 2018.

library(rvest)

## Loading required package: xml2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# Auction results page that the rest of the script scrapes.
url <- "https://www.realestate.com.au/auction-results/vic"

# Download the page and parse its HTML into a document object.

webpage <- read_html(x = url)


#What do I want?
#Address
#Suburb
#Match the address with suburb
#Price
#Bedroom
#Property Type
#Auction Result
#Date
#Agent
#URL


# Scrape the address nodes together with the suburb nodes in a single
# html_nodes() call, using one combined CSS selector.
Address <- webpage %>%
  html_nodes(".col-address, .col-suburb-name")

class(Address)

## [1] "xml_nodeset"

# Extract the text of each node: xml_nodeset -> character vector.
Address <- Address %>%
  html_text()

# Confirm the conversion.
class(Address)

## [1] "character"

# Wrap the vector in a one-column data frame (column name: Address).
address_df <- data.frame(Address)

# Scrape the suburb nodes on their own and keep just their text.
Suburb <- webpage %>%
  html_nodes(".col-suburb-name") %>%
  html_text()

# Wrap the vector in a one-column data frame (column name: Suburb).

suburb_df <- data.frame(Suburb)

# Pair every address with its suburb.
# address_df$Address holds the text of both the suburb nodes and the address
# nodes. match() returns a position for rows whose text is a suburb name and
# NA for every other row.

heading_pos <- match(address_df$Address, suburb_df$Suburb)

# Indexing by those positions writes the suburb name on the suburb rows and
# NA on the address rows.
address_df$Suburb <- suburb_df$Suburb[heading_pos]

# Fill the NAs with the suburb prior to it, as this suburb was the heading.
# na.locf() replaces each NA with the most recent non-NA before it (run
# ?na.locf in an interactive session for the help page).
# na.rm = FALSE matters: the default (TRUE) drops leading NAs, which shortens
# the vector and makes this column assignment fail if the first scraped row is
# not a suburb.
address_df$Suburb <- na.locf(address_df$Suburb, na.rm = FALSE)

# Remove the suburb rows themselves (the rows where Address equals Suburb).
# Using the match result rather than comparing the two columns keeps any
# address whose suburb is still NA instead of silently dropping it.
address_df <- address_df[is.na(heading_pos), ]
head(address_df); nrow(address_df)

##            Address       Suburb
## 2      10A Watt St Airport West
## 4  74 Evergreen Av    Albanvale
## 6    66 Barrett St  Albert Park
## 7       18 Page St  Albert Park
## 9  2/8 Mitchell Av Altona North
## 10     16 Allan St Altona North

## [1] 550

# Scrape the price column and keep just the text of each node.
Price <- webpage %>%
  html_nodes(" .col-property-price") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Price".
price_df <- data.frame(Price) %>%
  filter(Price != "Price")

# Preview the result; the row count should match the other columns.
head(price_df)
nrow(price_df)

##        Price
## 1   $925,000
## 2          -
## 3 $2,305,000
## 4 $2,740,000
## 5   $685,000
## 6          -

## [1] 550

# Scrape the property-type column and keep just the text of each node.
Property_Type <- webpage %>%
  html_nodes(".col-property-type") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Type".
property_type_df <- data.frame(Property_Type) %>%
  filter(Property_Type != "Type")

# Preview the result; the row count should match the other columns.
head(property_type_df)
nrow(property_type_df)

##   Property_Type
## 1         House
## 2         House
## 3         House
## 4         House
## 5          Unit
## 6         House

## [1] 550

# Scrape the bedroom-count column using html_nodes.
Beds <- html_nodes(webpage, ".col-num-beds")

# Convert the nodes to text.
Beds <- html_text(Beds)

# Convert to a one-column data frame (column name: Beds).
beds_df <- data.frame(Beds)

# Drop the rows that only hold the literal text "Beds".
# The condition must reference the Beds column: comparing the whole data frame
# (beds_df != "Beds") produces a one-column logical matrix rather than a
# logical vector, which filter() is not meant to receive.

beds_df <- filter(beds_df, Beds != "Beds")

# nrow to make sure the number of rows matches the other columns.
head(beds_df); nrow(beds_df)

##   Beds
## 1    4
## 2    3
## 3    4
## 4    3
## 5    3
## 6    3

## [1] 550

# Scrape the auction-result column and keep just the text of each node.
Auction_Result <- webpage %>%
  html_nodes(".col-auction-result") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Result".
auction_result_df <- data.frame(Auction_Result) %>%
  filter(Auction_Result != "Result")

# Preview the result; the row count should match the other columns.
head(auction_result_df)
nrow(auction_result_df)

##           Auction_Result
## 1 Passed In - Vendor Bid
## 2              Passed In
## 3        Sold At Auction
## 4        Sold At Auction
## 5        Sold At Auction
## 6              Passed In

## [1] 550

# Scrape the auction-date column and keep just the text of each node.
Date <- webpage %>%
  html_nodes(".col-auction-date") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Date".
Date_df <- data.frame(Date) %>%
  filter(Date != "Date")

# Preview the result; the row count should match the other columns.
head(Date_df)
nrow(Date_df)

##       Date
## 1 21/07/18
## 2 21/07/18
## 3 21/07/18
## 4 21/07/18
## 5 21/07/18
## 6 21/07/18

## [1] 550

# Scrape the agent column and keep just the text of each node.
# NOTE(review): the second selector alternative, ".col-agent ellipsis", is a
# descendant selector for an <ellipsis> element; ".col-agent.ellipsis" was
# presumably intended. Confirm against the page markup.
Agent <- webpage %>%
  html_nodes(".col-agent, .col-agent ellipsis") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Agency".
agent_df <- data.frame(Agent) %>%
  filter(Agent != "Agency")

# Preview the result; the row count should match the other columns.
# The values still carry a trailing newline and padding at this point.
head(agent_df)
nrow(agent_df)

##                                                    Agent
## 1           Barry Plant Essendon\n                      
## 2 Sahara Real Estate - Truganina\n                      
## 3        Greg Hocking Holdsworth\n                      
## 4        Greg Hocking Holdsworth\n                      
## 5          Barry Plant - Newport\n                      
## 6     Greg Hocking Elly Partners\n

## [1] 550

# Scrape the auction-date column and keep just the text of each node.
Date <- webpage %>%
  html_nodes(".col-auction-date") %>%
  html_text()

# Build a one-column data frame and drop the rows that only hold the
# literal text "Date".
Date_df <- data.frame(Date) %>%
  filter(Date != "Date")

# Preview the result; the row count should match the other columns.
head(Date_df)
nrow(Date_df)

##       Date
## 1 21/07/18
## 2 21/07/18
## 3 21/07/18
## 4 21/07/18
## 5 21/07/18
## 6 21/07/18

## [1] 550

# Scrape the agent column using html_nodes.
# ".col-agent" already selects every element carrying that class. The former
# second alternative, ".col-agent ellipsis", was a descendant selector for an
# <ellipsis> tag rather than a class match, so it has been removed.
Agent <- html_nodes(webpage, ".col-agent")

# Convert the nodes to text.
Agent <- html_text(Agent)

head(Agent)

## [1] "Agency"                                                
## [2] "Barry Plant Essendon\n                      "          
## [3] "Agency"                                                
## [4] "Sahara Real Estate - Truganina\n                      "
## [5] "Agency"                                                
## [6] "Greg Hocking Holdsworth\n                      "

# Data cleaning: drop everything from the first newline onwards, which removes
# the trailing "\n" and padding seen above.
Agent <- gsub("\n.*", "", Agent)
head(Agent)

## [1] "Agency"                         "Barry Plant Essendon"          
## [3] "Agency"                         "Sahara Real Estate - Truganina"
## [5] "Agency"                         "Greg Hocking Holdsworth"

# Convert to a one-column data frame (column name: Agent).
agent_df <- data.frame(Agent)

# Drop the rows that only hold the literal text "Agency".

agent_df <- filter(agent_df, Agent != "Agency")

# nrow to make sure the number of rows matches the other columns.
head(agent_df); nrow(agent_df)

##                            Agent
## 1           Barry Plant Essendon
## 2 Sahara Real Estate - Truganina
## 3        Greg Hocking Holdsworth
## 4        Greg Hocking Holdsworth
## 5          Barry Plant - Newport
## 6     Greg Hocking Elly Partners

## [1] 550

# Scrape the listing URL from each address node using html_nodes.
URL_node <- html_nodes(webpage, ".col-address")
URL_node

## {xml_nodeset (550)}
##  [1] <a href="/128722914" class="col-address">10A Watt St</a>
##  [2] <a href="/128597974" class="col-address">74 Evergreen Av</a>
##  [3] <a href="/128703422" class="col-address">66 Barrett St</a>
##  [4] <a href="/128683478" class="col-address">18 Page St</a>
##  [5] <a href="/128678506" class="col-address">2/8 Mitchell Av</a>
##  [6] <a href="/127400270" class="col-address">16 Allan St</a>
##  [7] <a href="/128643366" class="col-address">1/102 Maxweld St</a>
##  [8] <a href="/128703046" class="col-address">10/71 Denbigh Rd</a>
##  [9] <a href="/128711730" class="col-address">17A Ferguson St</a>
## [10] <a href="/128626506" class="col-address">9 Geddes St</a>
## [11] <a href="/128659922" class="col-address">1/6 Crete Av</a>
## [12] <div class="col-address">45 Fakenham Rd</div>
## [13] <a href="/128682954" class="col-address">1/211 Huntingdale Rd</a>
## [14] <a href="/128687990" class="col-address">7B Lawrence Av</a>
## [15] <a href="/128698282" class="col-address">17/5 Mcintosh Ct</a>
## [16] <a href="/128678562" class="col-address">16 The Sands</a>
## [17] <a href="/128598346" class="col-address">9 Sovereign Way</a>
## [18] <a href="/128659678" class="col-address">10 Doyle St</a>
## [19] <a href="/128702086" class="col-address">12/84 Westbury St</a>
## [20] <a href="/128701022" class="col-address">4/60 Sycamore Gv</a>
## ...

# Most address nodes are hyperlinks, so the link target is in the "href"
# attribute. Nodes without one (e.g. the <div> at [12] above) give NA.
URL <- html_attr(URL_node, "href")

# Prepend the site root to make the links absolute. Only real links are
# prefixed: pasting onto NA would otherwise create the bogus string
# "https://www.realestate.com.auNA", so missing links are kept as NA.
has_link <- !is.na(URL)
URL[has_link] <- paste0("https://www.realestate.com.au", URL[has_link])
head(URL)

## [1] "https://www.realestate.com.au/128722914"
## [2] "https://www.realestate.com.au/128597974"
## [3] "https://www.realestate.com.au/128703422"
## [4] "https://www.realestate.com.au/128683478"
## [5] "https://www.realestate.com.au/128678506"
## [6] "https://www.realestate.com.au/127400270"

# Convert to a one-column data frame (column name: URL).
URL_df <- data.frame(URL)

# nrow to make sure the number of rows matches the other columns.
head(URL_df); nrow(URL_df)

##                                       URL
## 1 https://www.realestate.com.au/128722914
## 2 https://www.realestate.com.au/128597974
## 3 https://www.realestate.com.au/128703422
## 4 https://www.realestate.com.au/128683478
## 5 https://www.realestate.com.au/128678506
## 6 https://www.realestate.com.au/127400270

## [1] 550

# Merge all the per-column data frames into one table.
# The pieces are bound side by side purely by row position, so they must all
# have the same number of rows. data.frame() can silently recycle a shorter
# piece when the row counts divide evenly, so check explicitly and stop on any
# mismatch.
row_counts <- vapply(
  list(address_df, price_df, beds_df, property_type_df,
       auction_result_df, Date_df, agent_df, URL_df),
  nrow,
  integer(1)
)
stopifnot(length(unique(row_counts)) == 1L)

AUCTION <- data.frame(address_df, price_df, beds_df, property_type_df, auction_result_df, Date_df, agent_df, URL_df)
nrow(AUCTION) # 550

## [1] 550

head(AUCTION)

##            Address       Suburb      Price Beds Property_Type
## 2      10A Watt St Airport West   $925,000    4         House
## 4  74 Evergreen Av    Albanvale          -    3         House
## 6    66 Barrett St  Albert Park $2,305,000    4         House
## 7       18 Page St  Albert Park $2,740,000    3         House
## 9  2/8 Mitchell Av Altona North   $685,000    3          Unit
## 10     16 Allan St Altona North          -    3         House
##            Auction_Result     Date                          Agent
## 2  Passed In - Vendor Bid 21/07/18           Barry Plant Essendon
## 4               Passed In 21/07/18 Sahara Real Estate - Truganina
## 6         Sold At Auction 21/07/18        Greg Hocking Holdsworth
## 7         Sold At Auction 21/07/18        Greg Hocking Holdsworth
## 9         Sold At Auction 21/07/18          Barry Plant - Newport
## 10              Passed In 21/07/18     Greg Hocking Elly Partners
##                                        URL
## 2  https://www.realestate.com.au/128722914
## 4  https://www.realestate.com.au/128597974
## 6  https://www.realestate.com.au/128703422
## 7  https://www.realestate.com.au/128683478
## 9  https://www.realestate.com.au/128678506
## 10 https://www.realestate.com.au/127400270

Webscraping Real Estate Data

Jenny Cheng

7/22/2018