Q1 (a)
# Load reader
library(readxl)
# Load dataset
property_market_dataset <- read_excel("~/Documents/MSc/Unit 1/property_market_dataset.xlsx")
View(property_market_dataset)
# Load packages and review dataset
summary(property_market_dataset)
## Suburb Rooms Type Price
## Length:34857 Min. : 1.000 Length:34857 Min. : 85000
## Class :character 1st Qu.: 2.000 Class :character 1st Qu.: 635000
## Mode :character Median : 3.000 Mode :character Median : 870000
## Mean : 3.031 Mean : 1050173
## 3rd Qu.: 4.000 3rd Qu.: 1295000
## Max. :16.000 Max. :11200000
## NA's :7610
## Method Distance Bedroom Bathroom
## Length:34857 Min. : 0.00 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.: 6.40 1st Qu.: 2.000 1st Qu.: 1.000
## Mode :character Median :10.30 Median : 3.000 Median : 2.000
## Mean :11.18 Mean : 3.085 Mean : 1.625
## 3rd Qu.:14.00 3rd Qu.: 4.000 3rd Qu.: 2.000
## Max. :48.10 Max. :30.000 Max. :12.000
## NA's :1 NA's :8217 NA's :8226
## Car Landsize BuildingArea YearBuilt
## Min. : 0.000 Min. : 0.0 Min. : 0.0 Min. :1196
## 1st Qu.: 1.000 1st Qu.: 224.0 1st Qu.: 102.0 1st Qu.:1940
## Median : 2.000 Median : 521.0 Median : 136.0 Median :1970
## Mean : 1.729 Mean : 593.6 Mean : 160.3 Mean :1965
## 3rd Qu.: 2.000 3rd Qu.: 670.0 3rd Qu.: 188.0 3rd Qu.:2000
## Max. :26.000 Max. :433014.0 Max. :44515.0 Max. :2106
## NA's :8728 NA's :11810 NA's :21115 NA's :19306
View(property_market_dataset)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(xray)
library(ggplot2)
library(skimr)
library(ggExtra)
skim(property_market_dataset)
| Name | property_market_dataset |
| Number of rows | 34857 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Suburb | 0 | 1 | 3 | 18 | 0 | 351 | 0 |
| Type | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| Method | 0 | 1 | 1 | 2 | 0 | 9 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Rooms | 0 | 1.00 | 3.03 | 0.97 | 1 | 2.0 | 3.0 | 4 | 16.0 | ▇▁▁▁▁ |
| Price | 7610 | 0.78 | 1050173.34 | 641467.13 | 85000 | 635000.0 | 870000.0 | 1295000 | 11200000.0 | ▇▁▁▁▁ |
| Distance | 1 | 1.00 | 11.18 | 6.79 | 0 | 6.4 | 10.3 | 14 | 48.1 | ▇▇▂▁▁ |
| Bedroom | 8217 | 0.76 | 3.08 | 0.98 | 0 | 2.0 | 3.0 | 4 | 30.0 | ▇▁▁▁▁ |
| Bathroom | 8226 | 0.76 | 1.62 | 0.72 | 0 | 1.0 | 2.0 | 2 | 12.0 | ▇▁▁▁▁ |
| Car | 8728 | 0.75 | 1.73 | 1.01 | 0 | 1.0 | 2.0 | 2 | 26.0 | ▇▁▁▁▁ |
| Landsize | 11810 | 0.66 | 593.60 | 3398.84 | 0 | 224.0 | 521.0 | 670 | 433014.0 | ▇▁▁▁▁ |
| BuildingArea | 21115 | 0.39 | 160.26 | 401.27 | 0 | 102.0 | 136.0 | 188 | 44515.0 | ▇▁▁▁▁ |
| YearBuilt | 19306 | 0.45 | 1965.29 | 37.33 | 1196 | 1940.0 | 1970.0 | 2000 | 2106.0 | ▁▁▁▂▇ |
xray::anomalies(property_market_dataset)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## $variables
## Variable q qNA pNA qZero pZero qBlank pBlank qInf pInf
## 1 BuildingArea 34857 21115 60.58% 76 0.22% 0 - 0 -
## 2 YearBuilt 34857 19306 55.39% 0 - 0 - 0 -
## 3 Landsize 34857 11810 33.88% 2437 6.99% 0 - 0 -
## 4 Car 34857 8728 25.04% 1631 4.68% 0 - 0 -
## 5 Bathroom 34857 8226 23.6% 46 0.13% 0 - 0 -
## 6 Bedroom 34857 8217 23.57% 17 0.05% 0 - 0 -
## 7 Price 34857 7610 21.83% 0 - 0 - 0 -
## 8 Distance 34857 1 0% 77 0.22% 0 - 0 -
## 9 Type 34857 0 - 0 - 0 - 0 -
## 10 Method 34857 0 - 0 - 0 - 0 -
## 11 Rooms 34857 0 - 0 - 0 - 0 -
## 12 Suburb 34857 0 - 0 - 0 - 0 -
## qDistinct type anomalous_percent
## 1 741 Numeric 60.79%
## 2 161 Numeric 55.39%
## 3 1685 Numeric 40.87%
## 4 16 Numeric 29.72%
## 5 12 Numeric 23.73%
## 6 16 Numeric 23.62%
## 7 2872 Numeric 21.83%
## 8 216 Numeric 0.22%
## 9 3 Character -
## 10 9 Character -
## 11 12 Numeric -
## 12 351 Character -
##
## $problem_variables
## [1] Variable q qNA pNA
## [5] qZero pZero qBlank pBlank
## [9] qInf pInf qDistinct type
## [13] anomalous_percent problems
## <0 rows> (or 0-length row.names)
summary(property_market_dataset)
## Suburb Rooms Type Price
## Length:34857 Min. : 1.000 Length:34857 Min. : 85000
## Class :character 1st Qu.: 2.000 Class :character 1st Qu.: 635000
## Mode :character Median : 3.000 Mode :character Median : 870000
## Mean : 3.031 Mean : 1050173
## 3rd Qu.: 4.000 3rd Qu.: 1295000
## Max. :16.000 Max. :11200000
## NA's :7610
## Method Distance Bedroom Bathroom
## Length:34857 Min. : 0.00 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.: 6.40 1st Qu.: 2.000 1st Qu.: 1.000
## Mode :character Median :10.30 Median : 3.000 Median : 2.000
## Mean :11.18 Mean : 3.085 Mean : 1.625
## 3rd Qu.:14.00 3rd Qu.: 4.000 3rd Qu.: 2.000
## Max. :48.10 Max. :30.000 Max. :12.000
## NA's :1 NA's :8217 NA's :8226
## Car Landsize BuildingArea YearBuilt
## Min. : 0.000 Min. : 0.0 Min. : 0.0 Min. :1196
## 1st Qu.: 1.000 1st Qu.: 224.0 1st Qu.: 102.0 1st Qu.:1940
## Median : 2.000 Median : 521.0 Median : 136.0 Median :1970
## Mean : 1.729 Mean : 593.6 Mean : 160.3 Mean :1965
## 3rd Qu.: 2.000 3rd Qu.: 670.0 3rd Qu.: 188.0 3rd Qu.:2000
## Max. :26.000 Max. :433014.0 Max. :44515.0 Max. :2106
## NA's :8728 NA's :11810 NA's :21115 NA's :19306
Observing that there are significant number of NA values for BuildingArea, YearBuild, Landsize, bathrooms and price which would all have a value. As the primary purpose of this analysis is to explore house prices, rows with a Price NA values will be excluded from the dataset for further analysis. Rows that include NA values for BuildingArea, YearBuild, Landsize, bathrooms will still be retained as there remains the potential to conduct analysis ignoring these values for reporting on other correlations. There are also several NA or blank values to car and bedrooms. As it is feasible that some of the properties are studios and/or households not have a vehicle, these fields will be retained and explored further. For example studios included in Type U or NA cars with lower Distance to the CBD. YearBuilt has a minimum data field of 1196 which is very unlikely and appears to be an outlier. There are also YearBuilt years set significantly in the future and is unlikley for off-plan purchases.
Q1 (b)
# Prepare initial columns as objects
yearbuilt<-property_market_dataset$YearBuilt
buildingarea <-
property_market_dataset$BuildingArea
bathroom <-property_market_dataset$Bathroom
# Filter dataframe for equal to 1 or more Bathroom.
property_market_ii <- filter(property_market_dataset, bathroom>=1)
# Remove all NA values from dataframe
property_market_iii <- na.omit(property_market_ii)
# Prepare objects.
suburb<- property_market_iii$Suburb
type<-property_market_iii$Type
method<-property_market_iii$Method
price<-property_market_iii$Price
distance <-property_market_iii$Distance
bedroom<-property_market_iii$Bedroom
car<-property_market_iii$Car
landsize<-property_market_iii$Landsize
buildingareat <- property_market_iii$BuildingArea
bathroomt<-property_market_iii$Bathroom
yearbuildt <-property_market_iii$YearBuilt
rooms<-property_market_iii$Rooms
Line plot price vs landsize to explore whether the larger plots of land would demand a larger property price. Both are continuous quantitative data variables which the line plot maps effectively.
g1 <- ggplot()+ geom_line(aes(y= price, x= landsize), data = property_market_iii)+ labs (title = "Property Price vs Landsize", x="Landsize", y = "Property Price")
g1
Scatterplot exploring property price in relation to year built with property price on the x axis for horizontal point mapping. This graph layout identifies concentrations of data points (trends), outliers to potentially filter in further analysis. Both are continuous quantitative data variables to plot effectively.
g2 <- ggplot() + geom_point(aes(y= yearbuildt, x= price), data = property_market_iii)+ labs (title = "Property Price vs Year Built", y="Year Built", x = "Property Price")
g2
Plot price vs bedroom to explore potenial trends of whether property price increases in correlation to an increase of bedrooms. This graph will help identify outliers to remove for further analysis. As the bedroom data are discrete quantitative variables they require counting which graph summarises well the non-continuous data.
g3 <- ggplot() + geom_count(aes(y= bedroom, x= price), data = property_market_iii)+ labs (title = "Property Price vs Bedroom(s)", y="Bedroom(s)", x = "Property Price", theme_classic())
g3
Plot price vs number of bedroom(s) and bathroom(s) within the property using a contoured graph that allows three variables. As both bathrooms and bathrooms are discrete quantitative variables this graph lends itself well with the continuous price data.
g4 <-ggplot()+ geom_contour_filled(aes(z=bedroom, x = bathroomt,y=price), data=property_market_iii) + labs (title = "Property Price with Bedroom(s) and Bathroom(s) Visualisation", y ="Property Price", x= "Bathroom(s)")
g4
##Q2 (b)
Exploratory question 1: Which properties have sold according to the dataset?
sold <-filter(property_market_iii, method=="S"| method=="SP"|method=="SN"|method=="SA"|method=="SS")
sold
## # A tibble: 6,963 × 12
## Suburb Rooms Type Price Method Distance Bedroom Bathroom Car Landsize
## <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Albion 1 u 185000 S 13.9 1 1 1 2347
## 2 Kingsville 1 u 210000 S 7.8 1 1 1 0
## 3 Footscray 1 u 216000 S 6.4 1 1 1 0
## 4 Reservoir 1 u 216000 S 11.2 2 1 1 0
## 5 Reservoir 1 u 220000 S 11.2 1 1 1 0
## 6 Moonee Po… 1 u 222000 S 6.2 1 1 0 0
## 7 Footscray 1 u 227000 S 6.4 1 1 1 0
## 8 Footscray 1 u 240000 SA 5.1 1 1 1 1015
## 9 Coburg 1 u 241000 S 7.8 1 1 1 0
## 10 Pascoe Va… 1 u 248500 S 9.9 1 1 1 49
## # … with 6,953 more rows, and 2 more variables: BuildingArea <dbl>,
## # YearBuilt <dbl>
Exploratory question 2: What twenty five suburbs have the highest number of properties sold within the dataset?
suburb_table <- sort(table(sold$Suburb), decreasing = T)
suburb_table
##
## Reservoir Richmond Brunswick Bentleigh East
## 160 120 117 110
## Coburg Preston Glenroy Essendon
## 108 97 92 89
## Craigieburn Yarraville Pascoe Vale Newport
## 85 85 84 81
## Hawthorn Northcote Thornbury Footscray
## 76 76 76 75
## Moonee Ponds Carnegie Port Melbourne Ascot Vale
## 75 68 68 67
## Glen Iris Brighton Keilor East Elwood
## 67 62 62 61
## Williamstown Kew South Yarra Bentleigh
## 61 60 60 59
## Camberwell Brighton East Malvern East St Kilda
## 58 57 57 57
## Fawkner Maribyrnong Epping Werribee
## 56 56 55 54
## Kensington Brunswick West Prahran Sunshine West
## 53 52 52 52
## Mill Park West Footscray Balwyn North Doncaster
## 51 50 48 47
## Sunbury Brunswick East Maidstone Hampton
## 46 44 43 42
## Hoppers Crossing Hawthorn East Ivanhoe Templestowe Lower
## 42 41 41 41
## Burwood Fitzroy North Sunshine Surrey Hills
## 40 40 40 40
## Collingwood North Melbourne Airport West Ormond
## 39 39 38 37
## Armadale Balwyn Moorabbin Ashburton
## 36 36 36 35
## Hadfield Strathmore Abbotsford Glen Waverley
## 35 35 33 33
## Seddon Coburg North Elsternwick Heidelberg Heights
## 33 32 32 32
## Point Cook Altona Lalor Niddrie
## 32 31 31 31
## South Morang Altona North Avondale Heights Box Hill
## 31 30 30 30
## Cheltenham Flemington Oakleigh South Bundoora
## 30 30 30 29
## Mount Waverley Murrumbeena South Melbourne Ashwood
## 29 29 29 28
## Braybrook Greensborough Malvern Caulfield South
## 28 28 28 27
## Greenvale Oak Park Clifton Hill Fitzroy
## 27 27 26 26
## Sunshine North Watsonia Toorak Aberfeldie
## 26 26 25 24
## Bulleen Doncaster East Frankston Mernda
## 24 24 24 24
## Roxburgh Park Albion Croydon Melton South
## 24 23 23 23
## Albert Park Heidelberg West Kingsville Chadstone
## 22 22 22 21
## Hughesdale Rosanna Wantirna South Fairfield
## 21 21 21 20
## Mulgrave Thomastown Windsor Gowanbrae
## 20 20 20 19
## Hillside Melbourne Middle Park Oakleigh
## 19 19 19 19
## Sandringham Spotswood Viewbank Broadmeadows
## 19 19 19 18
## Canterbury Ferntree Gully Jacana Meadow Heights
## 18 18 18 18
## Mentone St Albans Yallambie Heidelberg
## 18 18 18 17
## Parkdale West Melbourne Dingley Village Eltham
## 17 17 16 16
## Mitcham Seaford Wollert Carlton
## 16 16 16 15
## Hampton East Kealba Kew East Taylors Hill
## 15 15 15 15
## Alphington Doreen Forest Hill Gladstone Park
## 14 14 14 14
## Highett Keilor Park Southbank Templestowe
## 14 14 14 14
## Balaclava Bayswater Carlton North Carrum Downs
## 13 13 13 13
## Dandenong North Melton Melton West Nunawading
## 13 13 13 13
## Altona Meadows Beaumaris Boronia Dallas
## 12 12 12 12
## Donvale Essendon North Frankston South Heathmont
## 12 12 12 12
## Wyndham Vale Caulfield North Cremorne Eaglemont
## 12 11 11 11
## Ringwood Ringwood East Rowville South Kingsville
## 11 11 11 11
## Taylors Lakes Tullamarine Vermont Wantirna
## 11 11 11 11
## Blackburn South Keilor Downs Mont Albert Montmorency
## 10 10 10 10
## Parkville Tarneit Westmeadows Berwick
## 10 10 10 9
## Blackburn Caroline Springs Clayton Mooroolbark
## 9 9 9 9
## Mordialloc Noble Park Chelsea Heights Clayton South
## 9 9 8 8
## Gisborne Glen Huntly Ivanhoe East Keysborough
## 8 8 8 8
## Oakleigh East Truganina Wheelers Hill Bellfield
## 8 8 8 7
## Blackburn North Bonbeach Briar Hill Burwood East
## 7 7 7 7
## Croydon North Dandenong Delahey Edithvale
## 7 7 7 7
## Endeavour Hills Essendon West Kilsyth Kurunjang
## 7 7 7 7
## Pakenham Watsonia North Williamstown North Burnley
## 7 7 7 6
## Clarinda Deer Park Diamond Creek Doveton
## 6 6 6 6
## Eltham North Frankston North Kings Park Scoresby
## 6 6 6 6
## Springvale Travancore Vermont South Ardeer
## 6 6 6 5
## Aspendale Brooklyn Carrum Coolaroo
## 5 5 5 5
## East Melbourne Keilor Kingsbury Bayswater North
## 5 5 5 4
## Black Rock Burnside Burnside Heights Gardenvale
## 4 4 4 4
## McKinnon Narre Warren Ringwood North Ripponlea
## 4 4 4 4
## Sydenham Wallan Warrandyte Williams Landing
## 4 4 4 4
## Albanvale Attwood Chelsea Croydon Hills
## 3 3 3 3
## Croydon South Derrimut Diggers Rest Huntingdale
## 3 3 3 3
## Knoxfield Lower Plenty Mickleham Notting Hill
## 3 3 3 3
## Seabrook Strathmore Heights Aspendale Gardens Brookfield
## 3 3 2 2
## Cairnlea Caulfield East Chirnside Park Cranbourne
## 2 2 2 2
## Deepdene Emerald Hallam Langwarrin
## 2 2 2 2
## Mount Evelyn North Warrandyte Princes Hill Sandhurst
## 2 2 2 2
## Seaholme Skye St Helena Bacchus Marsh
## 2 2 2 1
## Beaconsfield Beaconsfield Upper Botanic Ridge Bullengarook
## 1 1 1 1
## Campbellfield Caulfield Cranbourne North Gisborne South
## 1 1 1 1
## Hampton Park Healesville Hurstbridge Keilor Lodge
## 1 1 1 1
## Kooyong Montrose Officer Plumpton
## 1 1 1 1
## Research Riddells Creek The Basin Upwey
## 1 1 1 1
## Waterways Wattle Glen Whittlesea Yarra Glen
## 1 1 1 1
top_suburbs <- suburb_table[1:25]
top_suburbs
##
## Reservoir Richmond Brunswick Bentleigh East Coburg
## 160 120 117 110 108
## Preston Glenroy Essendon Craigieburn Yarraville
## 97 92 89 85 85
## Pascoe Vale Newport Hawthorn Northcote Thornbury
## 84 81 76 76 76
## Footscray Moonee Ponds Carnegie Port Melbourne Ascot Vale
## 75 75 68 68 67
## Glen Iris Brighton Keilor East Elwood Williamstown
## 67 62 62 61 61
Exploratory question 3: What is the average house price for properties sold 25km or less from the Central Business District?
cbd_mean <- sold %>% filter(Distance<=25) %>% summarise(mean= mean(Price))
cbd_mean
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 1077674.
Q3 Recommendations for the estate agents would include prioritising generating clients within the Central Business District (CBD) as property sales yield on average 70% more if sold at or within a 25 km radius. Specifically exploring generating new listings and sales of houses as the number of houses sold are signifiantly higher than other property types according to the dataset. Houses also yield a higher listing/ sale price. However, it is recommended that the estate agents explore properties under development as these data records were incomplete and therefore not included in the analysis.
# Visualisation of property prices by distance from the CBD.
distance_gg <-ggplot()+ geom_line(aes(y= price, x= distance), data = property_market_iii)+ labs (title = "Property Price vs Distance from CBD", x ="Distance", y = "Property Price")
distance_gg
# Visualisation of property prices by the type of property.
price_type <- ggplot()+ geom_boxplot(aes(y= price, x= type), data = property_market_iii)+ labs (title = "Property Price vs Type of Property", x ="Property Type", y = "Property Price")
price_type
# Visualisation of types of property by the number listed and sold.
type_no_gg <- ggplot()+ geom_bar(aes(type), data = property_market_iii)+ labs (title = "Number of Properties by Type", x ="Property Type")
type_no_gg