Q1 (a)

# Load reader
library(readxl)
# Load dataset
property_market_dataset <- read_excel("~/Documents/MSc/Unit 1/property_market_dataset.xlsx")
View(property_market_dataset)
# Load packages and review dataset
summary(property_market_dataset)
##     Suburb              Rooms            Type               Price         
##  Length:34857       Min.   : 1.000   Length:34857       Min.   :   85000  
##  Class :character   1st Qu.: 2.000   Class :character   1st Qu.:  635000  
##  Mode  :character   Median : 3.000   Mode  :character   Median :  870000  
##                     Mean   : 3.031                      Mean   : 1050173  
##                     3rd Qu.: 4.000                      3rd Qu.: 1295000  
##                     Max.   :16.000                      Max.   :11200000  
##                                                         NA's   :7610      
##     Method             Distance        Bedroom          Bathroom     
##  Length:34857       Min.   : 0.00   Min.   : 0.000   Min.   : 0.000  
##  Class :character   1st Qu.: 6.40   1st Qu.: 2.000   1st Qu.: 1.000  
##  Mode  :character   Median :10.30   Median : 3.000   Median : 2.000  
##                     Mean   :11.18   Mean   : 3.085   Mean   : 1.625  
##                     3rd Qu.:14.00   3rd Qu.: 4.000   3rd Qu.: 2.000  
##                     Max.   :48.10   Max.   :30.000   Max.   :12.000  
##                     NA's   :1       NA's   :8217     NA's   :8226    
##       Car            Landsize         BuildingArea       YearBuilt    
##  Min.   : 0.000   Min.   :     0.0   Min.   :    0.0   Min.   :1196   
##  1st Qu.: 1.000   1st Qu.:   224.0   1st Qu.:  102.0   1st Qu.:1940   
##  Median : 2.000   Median :   521.0   Median :  136.0   Median :1970   
##  Mean   : 1.729   Mean   :   593.6   Mean   :  160.3   Mean   :1965   
##  3rd Qu.: 2.000   3rd Qu.:   670.0   3rd Qu.:  188.0   3rd Qu.:2000   
##  Max.   :26.000   Max.   :433014.0   Max.   :44515.0   Max.   :2106   
##  NA's   :8728     NA's   :11810      NA's   :21115     NA's   :19306
View(property_market_dataset)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(xray)
library(ggplot2)
library(skimr)
library(ggExtra)
skim(property_market_dataset)
Data summary
Name property_market_dataset
Number of rows 34857
Number of columns 12
_______________________
Column type frequency:
character 3
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Suburb 0 1 3 18 0 351 0
Type 0 1 1 1 0 3 0
Method 0 1 1 2 0 9 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Rooms 0 1.00 3.03 0.97 1 2.0 3.0 4 16.0 ▇▁▁▁▁
Price 7610 0.78 1050173.34 641467.13 85000 635000.0 870000.0 1295000 11200000.0 ▇▁▁▁▁
Distance 1 1.00 11.18 6.79 0 6.4 10.3 14 48.1 ▇▇▂▁▁
Bedroom 8217 0.76 3.08 0.98 0 2.0 3.0 4 30.0 ▇▁▁▁▁
Bathroom 8226 0.76 1.62 0.72 0 1.0 2.0 2 12.0 ▇▁▁▁▁
Car 8728 0.75 1.73 1.01 0 1.0 2.0 2 26.0 ▇▁▁▁▁
Landsize 11810 0.66 593.60 3398.84 0 224.0 521.0 670 433014.0 ▇▁▁▁▁
BuildingArea 21115 0.39 160.26 401.27 0 102.0 136.0 188 44515.0 ▇▁▁▁▁
YearBuilt 19306 0.45 1965.29 37.33 1196 1940.0 1970.0 2000 2106.0 ▁▁▁▂▇
xray::anomalies(property_market_dataset)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## $variables
##        Variable     q   qNA    pNA qZero pZero qBlank pBlank qInf pInf
## 1  BuildingArea 34857 21115 60.58%    76 0.22%      0      -    0    -
## 2     YearBuilt 34857 19306 55.39%     0     -      0      -    0    -
## 3      Landsize 34857 11810 33.88%  2437 6.99%      0      -    0    -
## 4           Car 34857  8728 25.04%  1631 4.68%      0      -    0    -
## 5      Bathroom 34857  8226  23.6%    46 0.13%      0      -    0    -
## 6       Bedroom 34857  8217 23.57%    17 0.05%      0      -    0    -
## 7         Price 34857  7610 21.83%     0     -      0      -    0    -
## 8      Distance 34857     1     0%    77 0.22%      0      -    0    -
## 9          Type 34857     0      -     0     -      0      -    0    -
## 10       Method 34857     0      -     0     -      0      -    0    -
## 11        Rooms 34857     0      -     0     -      0      -    0    -
## 12       Suburb 34857     0      -     0     -      0      -    0    -
##    qDistinct      type anomalous_percent
## 1        741   Numeric            60.79%
## 2        161   Numeric            55.39%
## 3       1685   Numeric            40.87%
## 4         16   Numeric            29.72%
## 5         12   Numeric            23.73%
## 6         16   Numeric            23.62%
## 7       2872   Numeric            21.83%
## 8        216   Numeric             0.22%
## 9          3 Character                 -
## 10         9 Character                 -
## 11        12   Numeric                 -
## 12       351 Character                 -
## 
## $problem_variables
##  [1] Variable          q                 qNA               pNA              
##  [5] qZero             pZero             qBlank            pBlank           
##  [9] qInf              pInf              qDistinct         type             
## [13] anomalous_percent problems         
## <0 rows> (or 0-length row.names)
summary(property_market_dataset)
##     Suburb              Rooms            Type               Price         
##  Length:34857       Min.   : 1.000   Length:34857       Min.   :   85000  
##  Class :character   1st Qu.: 2.000   Class :character   1st Qu.:  635000  
##  Mode  :character   Median : 3.000   Mode  :character   Median :  870000  
##                     Mean   : 3.031                      Mean   : 1050173  
##                     3rd Qu.: 4.000                      3rd Qu.: 1295000  
##                     Max.   :16.000                      Max.   :11200000  
##                                                         NA's   :7610      
##     Method             Distance        Bedroom          Bathroom     
##  Length:34857       Min.   : 0.00   Min.   : 0.000   Min.   : 0.000  
##  Class :character   1st Qu.: 6.40   1st Qu.: 2.000   1st Qu.: 1.000  
##  Mode  :character   Median :10.30   Median : 3.000   Median : 2.000  
##                     Mean   :11.18   Mean   : 3.085   Mean   : 1.625  
##                     3rd Qu.:14.00   3rd Qu.: 4.000   3rd Qu.: 2.000  
##                     Max.   :48.10   Max.   :30.000   Max.   :12.000  
##                     NA's   :1       NA's   :8217     NA's   :8226    
##       Car            Landsize         BuildingArea       YearBuilt    
##  Min.   : 0.000   Min.   :     0.0   Min.   :    0.0   Min.   :1196   
##  1st Qu.: 1.000   1st Qu.:   224.0   1st Qu.:  102.0   1st Qu.:1940   
##  Median : 2.000   Median :   521.0   Median :  136.0   Median :1970   
##  Mean   : 1.729   Mean   :   593.6   Mean   :  160.3   Mean   :1965   
##  3rd Qu.: 2.000   3rd Qu.:   670.0   3rd Qu.:  188.0   3rd Qu.:2000   
##  Max.   :26.000   Max.   :433014.0   Max.   :44515.0   Max.   :2106   
##  NA's   :8728     NA's   :11810      NA's   :21115     NA's   :19306

Observing that there are significant number of NA values for BuildingArea, YearBuild, Landsize, bathrooms and price which would all have a value. As the primary purpose of this analysis is to explore house prices, rows with a Price NA values will be excluded from the dataset for further analysis. Rows that include NA values for BuildingArea, YearBuild, Landsize, bathrooms will still be retained as there remains the potential to conduct analysis ignoring these values for reporting on other correlations. There are also several NA or blank values to car and bedrooms. As it is feasible that some of the properties are studios and/or households not have a vehicle, these fields will be retained and explored further. For example studios included in Type U or NA cars with lower Distance to the CBD. YearBuilt has a minimum data field of 1196 which is very unlikely and appears to be an outlier. There are also YearBuilt years set significantly in the future and is unlikley for off-plan purchases.

Q1 (b)

# Prepare initial columns as objects
yearbuilt<-property_market_dataset$YearBuilt
buildingarea <-
property_market_dataset$BuildingArea
bathroom <-property_market_dataset$Bathroom
# Filter dataframe for equal to 1 or more Bathroom.
property_market_ii <- filter(property_market_dataset, bathroom>=1)
# Remove all NA values from dataframe
property_market_iii <- na.omit(property_market_ii)
# Prepare objects.
suburb<- property_market_iii$Suburb 
type<-property_market_iii$Type
method<-property_market_iii$Method 
price<-property_market_iii$Price 
distance <-property_market_iii$Distance
bedroom<-property_market_iii$Bedroom 
car<-property_market_iii$Car
landsize<-property_market_iii$Landsize 
buildingareat <- property_market_iii$BuildingArea
bathroomt<-property_market_iii$Bathroom 
yearbuildt <-property_market_iii$YearBuilt
rooms<-property_market_iii$Rooms

Q2 (a)

Line plot price vs landsize to explore whether the larger plots of land would demand a larger property price. Both are continuous quantitative data variables which the line plot maps effectively.

g1 <- ggplot()+ geom_line(aes(y= price, x= landsize), data = property_market_iii)+ labs (title = "Property Price vs Landsize", x="Landsize", y = "Property Price")
g1

Scatterplot exploring property price in relation to year built with property price on the x axis for horizontal point mapping. This graph layout identifies concentrations of data points (trends), outliers to potentially filter in further analysis. Both are continuous quantitative data variables to plot effectively.

g2 <- ggplot() + geom_point(aes(y= yearbuildt, x= price), data = property_market_iii)+ labs (title = "Property Price vs Year Built", y="Year Built", x = "Property Price")
g2

Plot price vs bedroom to explore potenial trends of whether property price increases in correlation to an increase of bedrooms. This graph will help identify outliers to remove for further analysis. As the bedroom data are discrete quantitative variables they require counting which graph summarises well the non-continuous data.

g3 <- ggplot() + geom_count(aes(y= bedroom, x= price), data = property_market_iii)+ labs (title = "Property Price vs Bedroom(s)", y="Bedroom(s)", x = "Property Price", theme_classic())
g3

Plot price vs number of bedroom(s) and bathroom(s) within the property using a contoured graph that allows three variables. As both bathrooms and bathrooms are discrete quantitative variables this graph lends itself well with the continuous price data.

g4 <-ggplot()+ geom_contour_filled(aes(z=bedroom, x = bathroomt,y=price), data=property_market_iii) + labs (title = "Property Price with Bedroom(s) and Bathroom(s) Visualisation", y ="Property Price", x= "Bathroom(s)")
g4

##Q2 (b)

Exploratory question 1: Which properties have sold according to the dataset?

sold <-filter(property_market_iii, method=="S"| method=="SP"|method=="SN"|method=="SA"|method=="SS")
sold
## # A tibble: 6,963 × 12
##    Suburb     Rooms Type   Price Method Distance Bedroom Bathroom   Car Landsize
##    <chr>      <dbl> <chr>  <dbl> <chr>     <dbl>   <dbl>    <dbl> <dbl>    <dbl>
##  1 Albion         1 u     185000 S          13.9       1        1     1     2347
##  2 Kingsville     1 u     210000 S           7.8       1        1     1        0
##  3 Footscray      1 u     216000 S           6.4       1        1     1        0
##  4 Reservoir      1 u     216000 S          11.2       2        1     1        0
##  5 Reservoir      1 u     220000 S          11.2       1        1     1        0
##  6 Moonee Po…     1 u     222000 S           6.2       1        1     0        0
##  7 Footscray      1 u     227000 S           6.4       1        1     1        0
##  8 Footscray      1 u     240000 SA          5.1       1        1     1     1015
##  9 Coburg         1 u     241000 S           7.8       1        1     1        0
## 10 Pascoe Va…     1 u     248500 S           9.9       1        1     1       49
## # … with 6,953 more rows, and 2 more variables: BuildingArea <dbl>,
## #   YearBuilt <dbl>

Exploratory question 2: What twenty five suburbs have the highest number of properties sold within the dataset?

suburb_table <- sort(table(sold$Suburb), decreasing = T)
suburb_table
## 
##          Reservoir           Richmond          Brunswick     Bentleigh East 
##                160                120                117                110 
##             Coburg            Preston            Glenroy           Essendon 
##                108                 97                 92                 89 
##        Craigieburn         Yarraville        Pascoe Vale            Newport 
##                 85                 85                 84                 81 
##           Hawthorn          Northcote          Thornbury          Footscray 
##                 76                 76                 76                 75 
##       Moonee Ponds           Carnegie     Port Melbourne         Ascot Vale 
##                 75                 68                 68                 67 
##          Glen Iris           Brighton        Keilor East             Elwood 
##                 67                 62                 62                 61 
##       Williamstown                Kew        South Yarra          Bentleigh 
##                 61                 60                 60                 59 
##         Camberwell      Brighton East       Malvern East           St Kilda 
##                 58                 57                 57                 57 
##            Fawkner        Maribyrnong             Epping           Werribee 
##                 56                 56                 55                 54 
##         Kensington     Brunswick West            Prahran      Sunshine West 
##                 53                 52                 52                 52 
##          Mill Park     West Footscray       Balwyn North          Doncaster 
##                 51                 50                 48                 47 
##            Sunbury     Brunswick East          Maidstone            Hampton 
##                 46                 44                 43                 42 
##   Hoppers Crossing      Hawthorn East            Ivanhoe  Templestowe Lower 
##                 42                 41                 41                 41 
##            Burwood      Fitzroy North           Sunshine       Surrey Hills 
##                 40                 40                 40                 40 
##        Collingwood    North Melbourne       Airport West             Ormond 
##                 39                 39                 38                 37 
##           Armadale             Balwyn          Moorabbin          Ashburton 
##                 36                 36                 36                 35 
##           Hadfield         Strathmore         Abbotsford      Glen Waverley 
##                 35                 35                 33                 33 
##             Seddon       Coburg North        Elsternwick Heidelberg Heights 
##                 33                 32                 32                 32 
##         Point Cook             Altona              Lalor            Niddrie 
##                 32                 31                 31                 31 
##       South Morang       Altona North   Avondale Heights           Box Hill 
##                 31                 30                 30                 30 
##         Cheltenham         Flemington     Oakleigh South           Bundoora 
##                 30                 30                 30                 29 
##     Mount Waverley        Murrumbeena    South Melbourne            Ashwood 
##                 29                 29                 29                 28 
##          Braybrook      Greensborough            Malvern    Caulfield South 
##                 28                 28                 28                 27 
##          Greenvale           Oak Park       Clifton Hill            Fitzroy 
##                 27                 27                 26                 26 
##     Sunshine North           Watsonia             Toorak         Aberfeldie 
##                 26                 26                 25                 24 
##            Bulleen     Doncaster East          Frankston             Mernda 
##                 24                 24                 24                 24 
##      Roxburgh Park             Albion            Croydon       Melton South 
##                 24                 23                 23                 23 
##        Albert Park    Heidelberg West         Kingsville          Chadstone 
##                 22                 22                 22                 21 
##         Hughesdale            Rosanna     Wantirna South          Fairfield 
##                 21                 21                 21                 20 
##           Mulgrave         Thomastown            Windsor          Gowanbrae 
##                 20                 20                 20                 19 
##           Hillside          Melbourne        Middle Park           Oakleigh 
##                 19                 19                 19                 19 
##        Sandringham          Spotswood           Viewbank       Broadmeadows 
##                 19                 19                 19                 18 
##         Canterbury     Ferntree Gully             Jacana     Meadow Heights 
##                 18                 18                 18                 18 
##            Mentone          St Albans          Yallambie         Heidelberg 
##                 18                 18                 18                 17 
##           Parkdale     West Melbourne    Dingley Village             Eltham 
##                 17                 17                 16                 16 
##            Mitcham            Seaford            Wollert            Carlton 
##                 16                 16                 16                 15 
##       Hampton East             Kealba           Kew East       Taylors Hill 
##                 15                 15                 15                 15 
##         Alphington             Doreen        Forest Hill     Gladstone Park 
##                 14                 14                 14                 14 
##            Highett        Keilor Park          Southbank        Templestowe 
##                 14                 14                 14                 14 
##          Balaclava          Bayswater      Carlton North       Carrum Downs 
##                 13                 13                 13                 13 
##    Dandenong North             Melton        Melton West         Nunawading 
##                 13                 13                 13                 13 
##     Altona Meadows          Beaumaris            Boronia             Dallas 
##                 12                 12                 12                 12 
##            Donvale     Essendon North    Frankston South          Heathmont 
##                 12                 12                 12                 12 
##       Wyndham Vale    Caulfield North           Cremorne          Eaglemont 
##                 12                 11                 11                 11 
##           Ringwood      Ringwood East           Rowville   South Kingsville 
##                 11                 11                 11                 11 
##      Taylors Lakes        Tullamarine            Vermont           Wantirna 
##                 11                 11                 11                 11 
##    Blackburn South       Keilor Downs        Mont Albert        Montmorency 
##                 10                 10                 10                 10 
##          Parkville            Tarneit        Westmeadows            Berwick 
##                 10                 10                 10                  9 
##          Blackburn   Caroline Springs            Clayton        Mooroolbark 
##                  9                  9                  9                  9 
##         Mordialloc         Noble Park    Chelsea Heights      Clayton South 
##                  9                  9                  8                  8 
##           Gisborne        Glen Huntly       Ivanhoe East        Keysborough 
##                  8                  8                  8                  8 
##      Oakleigh East          Truganina      Wheelers Hill          Bellfield 
##                  8                  8                  8                  7 
##    Blackburn North           Bonbeach         Briar Hill       Burwood East 
##                  7                  7                  7                  7 
##      Croydon North          Dandenong            Delahey          Edithvale 
##                  7                  7                  7                  7 
##    Endeavour Hills      Essendon West            Kilsyth          Kurunjang 
##                  7                  7                  7                  7 
##           Pakenham     Watsonia North Williamstown North            Burnley 
##                  7                  7                  7                  6 
##           Clarinda          Deer Park      Diamond Creek            Doveton 
##                  6                  6                  6                  6 
##       Eltham North    Frankston North         Kings Park           Scoresby 
##                  6                  6                  6                  6 
##         Springvale         Travancore      Vermont South             Ardeer 
##                  6                  6                  6                  5 
##          Aspendale           Brooklyn             Carrum           Coolaroo 
##                  5                  5                  5                  5 
##     East Melbourne             Keilor          Kingsbury    Bayswater North 
##                  5                  5                  5                  4 
##         Black Rock           Burnside   Burnside Heights         Gardenvale 
##                  4                  4                  4                  4 
##           McKinnon       Narre Warren     Ringwood North          Ripponlea 
##                  4                  4                  4                  4 
##           Sydenham             Wallan         Warrandyte   Williams Landing 
##                  4                  4                  4                  4 
##          Albanvale            Attwood            Chelsea      Croydon Hills 
##                  3                  3                  3                  3 
##      Croydon South           Derrimut       Diggers Rest        Huntingdale 
##                  3                  3                  3                  3 
##          Knoxfield       Lower Plenty          Mickleham       Notting Hill 
##                  3                  3                  3                  3 
##           Seabrook Strathmore Heights  Aspendale Gardens         Brookfield 
##                  3                  3                  2                  2 
##           Cairnlea     Caulfield East     Chirnside Park         Cranbourne 
##                  2                  2                  2                  2 
##           Deepdene            Emerald             Hallam         Langwarrin 
##                  2                  2                  2                  2 
##       Mount Evelyn   North Warrandyte       Princes Hill          Sandhurst 
##                  2                  2                  2                  2 
##           Seaholme               Skye          St Helena      Bacchus Marsh 
##                  2                  2                  2                  1 
##       Beaconsfield Beaconsfield Upper      Botanic Ridge       Bullengarook 
##                  1                  1                  1                  1 
##      Campbellfield          Caulfield   Cranbourne North     Gisborne South 
##                  1                  1                  1                  1 
##       Hampton Park        Healesville        Hurstbridge       Keilor Lodge 
##                  1                  1                  1                  1 
##            Kooyong           Montrose            Officer           Plumpton 
##                  1                  1                  1                  1 
##           Research     Riddells Creek          The Basin              Upwey 
##                  1                  1                  1                  1 
##          Waterways        Wattle Glen         Whittlesea         Yarra Glen 
##                  1                  1                  1                  1
top_suburbs <- suburb_table[1:25]
top_suburbs 
## 
##      Reservoir       Richmond      Brunswick Bentleigh East         Coburg 
##            160            120            117            110            108 
##        Preston        Glenroy       Essendon    Craigieburn     Yarraville 
##             97             92             89             85             85 
##    Pascoe Vale        Newport       Hawthorn      Northcote      Thornbury 
##             84             81             76             76             76 
##      Footscray   Moonee Ponds       Carnegie Port Melbourne     Ascot Vale 
##             75             75             68             68             67 
##      Glen Iris       Brighton    Keilor East         Elwood   Williamstown 
##             67             62             62             61             61

Exploratory question 3: What is the average house price for properties sold 25km or less from the Central Business District?

cbd_mean <- sold %>% filter(Distance<=25) %>% summarise(mean= mean(Price))
cbd_mean
## # A tibble: 1 × 1
##       mean
##      <dbl>
## 1 1077674.

Q3 Recommendations for the estate agents would include prioritising generating clients within the Central Business District (CBD) as property sales yield on average 70% more if sold at or within a 25 km radius. Specifically exploring generating new listings and sales of houses as the number of houses sold are signifiantly higher than other property types according to the dataset. Houses also yield a higher listing/ sale price. However, it is recommended that the estate agents explore properties under development as these data records were incomplete and therefore not included in the analysis.

# Visualisation of property prices by distance from the CBD.
distance_gg <-ggplot()+ geom_line(aes(y= price, x= distance), data = property_market_iii)+ labs (title = "Property Price vs Distance from CBD", x ="Distance", y = "Property Price")
distance_gg 

# Visualisation of property prices by the type of property.
price_type <- ggplot()+ geom_boxplot(aes(y= price, x= type), data = property_market_iii)+ labs (title = "Property Price vs Type of Property", x ="Property Type", y = "Property Price")
price_type

# Visualisation of types of property by the number listed and sold.
type_no_gg <- ggplot()+ geom_bar(aes(type), data = property_market_iii)+ labs (title = "Number of Properties by Type", x ="Property Type")
type_no_gg