#### The factors considered to predict the house price:
Rooms : Number of rooms
Price : Price in dollars
Method : S - property sold; SP - property sold prior; PI - property passed in; PN - sold prior not disclosed; SN - sold not disclosed; NB - no bid; VB - vendor bid; W - withdrawn prior to auction; SA - sold after auction; SS - sold after auction price not disclosed. N/A - price or highest bid not available.
Type : br - bedroom(s); h - house, cottage, villa, semi, terrace; u - unit, duplex; t - townhouse; dev site - development site; o res - other residential.
SellerG : Real Estate Agent
Date : Date sold
Distance : Distance from CBD
Regionname : General Region (West, North West, North, North east …etc)
Propertycount : Number of properties that exist in the suburb.
Bedroom2 : Scraped # of Bedrooms (from different source)
Bathroom : Number of Bathrooms
Car : Number of carspots
Landsize : Land Size
BuildingArea : Building Size
CouncilArea : Governing council for the area
Using the necessary libraries
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.0
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
Importing the dataset
#Reading the CSV file into a data frame named *melb_data*
#stringsAsFactors is set explicitly because its default changed from TRUE to
#FALSE in R 4.0.0; TRUE keeps the text columns as factors, which is what the
#summary output of this data relies on
melb_data <- read.csv("melb_data.csv", stringsAsFactors = TRUE)
Summarizing data
#The variable is summarized to get the 5 point summary
summary(melb_data)
## X Suburb Address
## Min. : 1 Reservoir : 541 1/1 Clarendon St: 3
## 1st Qu.: 5937 Bentleigh East: 368 13 Robinson St : 3
## Median :11820 Richmond : 333 14 Arthur St : 3
## Mean :11827 Preston : 312 14 Rose St : 3
## 3rd Qu.:17734 Brunswick : 286 16 Smith St : 3
## Max. :23546 Essendon : 265 2 Bruce St : 3
## (Other) :16291 (Other) :18378
## Rooms Type Price Method
## Min. : 1.000 h:12095 Min. : 85000 PI: 2189
## 1st Qu.: 2.000 t: 2005 1st Qu.: 633000 S :12034
## Median : 3.000 u: 4296 Median : 880000 SA: 128
## Mean : 2.935 Mean :1056697 SP: 2349
## 3rd Qu.: 3.000 3rd Qu.:1302000 VB: 1696
## Max. :12.000 Max. :9000000
##
## SellerG Date Distance Postcode
## Nelson :2002 27/05/2017: 610 Min. : 0.00 Min. :3000
## Jellis :1759 23/09/2017: 591 1st Qu.: 6.30 1st Qu.:3046
## hockingstuart:1580 16/09/2017: 546 Median : 9.70 Median :3085
## Barry :1390 3/06/2017 : 525 Mean :10.39 Mean :3107
## Ray :1032 26/08/2017: 523 3rd Qu.:13.30 3rd Qu.:3149
## Buxton : 939 17/06/2017: 489 Max. :48.10 Max. :3978
## (Other) :9694 (Other) :15112 NA's :1 NA's :1
## Bedroom2 Bathroom Car Landsize
## Min. : 0.000 Min. :0.000 Min. : 0.000 Min. : 0.0
## 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 176.5
## Median : 3.000 Median :1.000 Median : 2.000 Median : 440.0
## Mean : 2.913 Mean :1.538 Mean : 1.616 Mean : 558.1
## 3rd Qu.: 3.000 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 651.0
## Max. :20.000 Max. :8.000 Max. :10.000 Max. :433014.0
## NA's :3469 NA's :3471 NA's :3576 NA's :4793
## BuildingArea YearBuilt CouncilArea Lattitude
## Min. : 0.0 Min. :1196 :6163 Min. :-38.18
## 1st Qu.: 93.0 1st Qu.:1950 Moreland :1163 1st Qu.:-37.86
## Median : 126.0 Median :1970 Boroondara :1160 Median :-37.80
## Mean : 151.2 Mean :1966 Moonee Valley: 999 Mean :-37.81
## 3rd Qu.: 174.0 3rd Qu.:2000 Darebin : 934 3rd Qu.:-37.76
## Max. :44515.0 Max. :2018 Glen Eira : 848 Max. :-37.41
## NA's :10634 NA's :9438 (Other) :7129 NA's :3332
## Longtitude Regionname Propertycount
## Min. :144.4 Southern Metropolitan :6343 Min. : 249
## 1st Qu.:144.9 Northern Metropolitan :5307 1st Qu.: 4294
## Median :145.0 Western Metropolitan :3887 Median : 6567
## Mean :145.0 Eastern Metropolitan :1995 Mean : 7518
## 3rd Qu.:145.1 South-Eastern Metropolitan: 680 3rd Qu.:10331
## Max. :145.5 Eastern Victoria : 78 Max. :21650
## NA's :3332 (Other) : 106 NA's :1
Viewing the names of the variables in the dataset
#The variable names in the dataset are viewed
names(melb_data)
## [1] "X" "Suburb" "Address" "Rooms"
## [5] "Type" "Price" "Method" "SellerG"
## [9] "Date" "Distance" "Postcode" "Bedroom2"
## [13] "Bathroom" "Car" "Landsize" "BuildingArea"
## [17] "YearBuilt" "CouncilArea" "Lattitude" "Longtitude"
## [21] "Regionname" "Propertycount"
Fitting a model - decision tree algorithm
Assigning a variable named fit to the model
#Fitting a decision tree for Price on the full dataset and storing it in *fit*
fit <- rpart(
  Price ~ Rooms + Bathroom + Landsize + BuildingArea + YearBuilt +
    Lattitude + Longtitude,
  data = melb_data
)
Drawing and labelling the decision tree
#Plotting the fitted tree, then writing the node labels onto the plot
plot(fit, uniform = TRUE)
text(fit, cex = 0.6)
Predicting the price of the houses
#Viewing the first 6 values of the dataset
head(melb_data)
## X Suburb Address Rooms Type Price Method SellerG
## 1 1 Abbotsford 85 Turner St 2 h 1480000 S Biggin
## 2 2 Abbotsford 25 Bloomburg St 2 h 1035000 S Biggin
## 3 4 Abbotsford 5 Charles St 3 h 1465000 SP Biggin
## 4 5 Abbotsford 40 Federation La 3 h 850000 PI Biggin
## 5 6 Abbotsford 55a Park St 4 h 1600000 VB Nelson
## 6 10 Abbotsford 129 Charles St 2 h 941000 S Jellis
## Date Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea
## 1 3/12/2016 2.5 3067 2 1 1 202 NA
## 2 4/02/2016 2.5 3067 2 1 0 156 79
## 3 4/03/2017 2.5 3067 3 2 0 134 150
## 4 4/03/2017 2.5 3067 3 2 1 94 NA
## 5 4/06/2016 2.5 3067 3 1 2 120 142
## 6 7/05/2016 2.5 3067 2 1 0 181 NA
## YearBuilt CouncilArea Lattitude Longtitude Regionname
## 1 NA Yarra -37.7996 144.9984 Northern Metropolitan
## 2 1900 Yarra -37.8079 144.9934 Northern Metropolitan
## 3 1900 Yarra -37.8093 144.9944 Northern Metropolitan
## 4 NA Yarra -37.7969 144.9969 Northern Metropolitan
## 5 2014 Yarra -37.8072 144.9941 Northern Metropolitan
## 6 NA Yarra -37.8041 144.9953 Northern Metropolitan
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
#Predicting the house price for the first 6 rows of the dataset
predict(fit, newdata = head(melb_data))
## 1 2 3 4 5 6
## 706378.9 706378.9 1184840.0 1184840.0 1644287.4 706378.9
#Printing the actual price of the house
print(head(melb_data$Price))
## [1] 1480000 1035000 1465000 850000 1600000 941000
#Using the *modelr* package to calculate the Mean Absolute Error
library(modelr)
#Calculating the mean absolute error, i.e. the mean of |predicted price - actual price|, on the same data the model was fitted to
mae(model=fit,data=melb_data)
## [1] 322001.3
#Splitting the data into two sets: test data (30%) and train data (70%)
#The seed is fixed first so that the random partition, and every result
#computed from it, is the same each time the script is run
set.seed(42)
splitData <- resample_partition(melb_data, c(test = 0.3, train = 0.7))
#Checking the dimensions of the test and train data
sapply(splitData,dim)
## test train
## [1,] 5518 12878
## [2,] 22 22
Fitting a model to the train data and calculating the MAE on the test data
#Fitting a decision tree to the train data
fit2 <- rpart(Price ~ Rooms + Bathroom + Landsize + BuildingArea + YearBuilt + Lattitude + Longtitude, data = splitData$train)
#Calculating the MAE on the test data
mae(model=fit2,data = splitData$test)
## [1] 326640
#A function to get the mean absolute error (MAE) of a decision tree limited
#to a given maximum depth.
#  maxdepth      : maximum tree depth, passed to rpart.control(maxdepth = )
#  target        : name of the target column, as a single string
#  predictors    : character vector where each item is the name of a column
#  training_data : data the tree is fitted on
#  testing_data  : data the error is calculated on
#Returns the value of mae() for the fitted tree on testing_data.
get_mae <- function(maxdepth, target, predictors, training_data, testing_data) {
  #Fail early with a readable message instead of a formula parse error
  if (!is.character(target) || length(target) != 1L) {
    stop("`target` must be a single column name", call. = FALSE)
  }
  if (!is.character(predictors) || length(predictors) == 0L) {
    stop("`predictors` must be a non-empty character vector", call. = FALSE)
  }

  #Builds the formula target ~ predictor1 + predictor2 + ...
  model_formula <- reformulate(predictors, response = target)

  tree <- rpart(
    model_formula,
    data = training_data,
    control = rpart.control(maxdepth = maxdepth)
  )

  #The result is returned directly rather than stored in a local called
  #`mae`, which would shadow the mae() function being called
  mae(tree, testing_data)
}
#Assigning values to the variables, so that they can be used in the function
target <- "Price"
predictors <- c(
  "Rooms", "Bathroom", "Landsize", "BuildingArea",
  "YearBuilt", "Lattitude", "Longtitude"
)

#Printing the test MAE for trees of maximum depth 1 to 10
#The result is stored in depth_mae rather than mae, so that the loop does not
#leave a global variable with the same name as the mae() function
for (i in seq_len(10)) {
  depth_mae <- get_mae(
    maxdepth = i, target = target, predictors = predictors,
    training_data = splitData$train, testing_data = splitData$test
  )
  print(glue::glue("Maxdepth: ", i, "\t MAE: ", depth_mae))
}
## Maxdepth: 1 MAE: 404993.335226328
## Maxdepth: 2 MAE: 371425.741501442
## Maxdepth: 3 MAE: 349273.409930209
## Maxdepth: 4 MAE: 336043.478802731
## Maxdepth: 5 MAE: 329594.970626216
## Maxdepth: 6 MAE: 326639.98033566
## Maxdepth: 7 MAE: 326639.98033566
## Maxdepth: 8 MAE: 326639.98033566
## Maxdepth: 9 MAE: 326639.98033566
## Maxdepth: 10 MAE: 326639.98033566
#Training a random forest on the train data and storing the model in *fit3*
fit3 <- randomForest(
  Price ~ Rooms + Bathroom + Landsize + BuildingArea + YearBuilt +
    Lattitude + Longtitude,
  data = splitData$train,
  na.action = na.exclude
)
#Calculating the MAE on the test data after fitting a model using the Random Forest algorithm
mae(model=fit3,data = splitData$test)
## [1] 181352.7