# This is the R chunk for the required packages
library(readr)
library(readxl)
library(stringr)
library(dplyr)
library(tidyr)
library(lubridate)
library(Hmisc)
library(forecast)
library(infotheo)
#capping function
cap <- function(x){
quantiles <- quantile(x, c(.05, 0.25, 0.75, .95 ), na.rm = TRUE)
x[ x < quantiles[2] - 1.5*IQR(x,na.rm = TRUE)] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x,na.rm = TRUE)] <- quantiles[4]
x
}
For pre-processing the data was first changed from an un-tidy format to tidy for the Crime data set before merging into the housing data set. The idea is to find if incident reports effect the housing price. Once the Data is merged it is checked for values that make no sense. Having 0 bathrooms, 0 bedrooms, no land or no building size where some of the variables identified for having incorrect data. These values were checked with the summary () command. All values that shouldn’t be 0 got changed into NA to filled later and some values for land sizes were 0.01 so looking at the data we changed anything below 20m to also NA. Boxplots where then used to identify large outliers of the data whether they were too low or too high. As we are working with building/land sizes, bedrooms, bathrooms and carport, these values were capped to pull in line their lower and upper brackets without needing to remove data. Before filling the NA values, we look at the correlation between all independent variables and the dependant variable of Price. This helps identify columns that can be removed with very small correlations. Once removed rounded values like cars, bedrooms, bathrooms were imputed with the median while numeric values where were imputed with the mean. Once all data is filled, we check for normalisation across numeric values. Appropriate transformations were given to price, land and distance to have a better gaussian distribution, the reason for this is it helps when creating a linear regressor to predict the house price. Finally naming conventions were changed to better explain the transformation to the data set.
Dataset 1 is Melbourne_housing_FULL from: https://www.kaggle.com/anthonypino/melbourne-housing-market Variables: Suburb: Suburb Address: Address Rooms: Number of rooms Price: Price in Australian dollars Method: factor S - property sold; SP - property sold prior; PI - property passed in; PN - sold prior not disclosed; SN - sold not disclosed; NB - no bid; VB - vendor bid; W - withdrawn prior to auction; SA - sold after auction; SS - sold after auction price not disclosed. N/A - price or highest bid not available. Type: orderd factors h - house,cottage,villa, semi,terrace u - unit, duplex t - townhouse Seller: Real Estate Agent
Date: Date sold Distance: Distance from CBD in Kilometres Regionname: General Region (West, North West, North, North east …etc) Propertycount: Number of properties that exist in the suburb. Bedroom2 : Scraped # of Bedrooms (from different source) Bathroom: Number of Bathrooms Car: Number of carspots Landsize: Land Size in Metres BuildingArea: Building Size in Metres YearBuilt: Year the house was built CouncilArea: Governing council for the area Lattitude: Self explanitory Longtitude: Self explanitory
Data set 2 is Data_Tables_LGA_Victim_Reports_Year_Ending_December_2019 from https://www.crimestatistics.vic.gov.au/crime-statistics/latest-crime-data/download-data We use sheet named “Table 01” Variables: Year: Year of incident reports Year Ending: When the year ended Polic Region: The region Local Government Area: The local council power Victim Reports: Reports made by victims of an incident Rate per 100,000 population: reports scaled per 100k people
#house prices data from kaggle
housePrices <- read.csv('Melbourne_housing_FULL.csv', stringsAsFactors = FALSE)
#crime data from crime statistics.vic
crimeData <- read_excel('Data_Tables_LGA_Victim_Reports_Year_Ending_December_2019.xlsx', sheet = 'Table 01')
#data Sources Heads
head(housePrices, 5)
head(crimeData, 5)
NA
With the crimeData, it is currently in a long form. The column Year goes from 2019 to 2010 in years, to better prepare this for merging we will first be transforming the data to wide to fit better tidy practices.
#changing the data from long to wide (it was untidy) before merging
TidyCrime <- crimeData%>%
select(c("Year","Victim Reports","Local Government Area","Police Region"))%>%
group_by(Year)%>%
mutate(grouped_id = row_number())%>%
spread(Year,`Victim Reports`)%>%
select(-grouped_id)
head(TidyCrime, 5)
NA
Before merging we can see there are some possible features that we can create using the now tidied crimeData which is the average and total of incident reports over the course of 10 years
#Feature Engineering - creating average incidents for an area
featureTidyCrime <- TidyCrime%>%
rowwise()%>%
mutate('Average Incident Reports'=mean(c(`2010`,`2011`,`2012`,`2013`,`2014`,`2015`,`2016`,`2017`,`2018`,`2019`)))%>%
mutate('Total Reports in 10 Years'= sum(c(`2010`,`2011`,`2012`,`2013`,`2014`,`2015`,`2016`,`2017`,`2018`,`2019`)))
In order to merge the featureTidyCrime and housePrices we first need to establish a key to merge them at. We find the CouncilArea in housePrices is the same fields as Local Government Area in the crimeData. We rename the column before merging.
#removing city council and changing name to local govenrment area
housePrices$CouncilArea <- housePrices$CouncilArea %>% str_replace(" City Council","")
housePrices <- housePrices %>%
rename('Local Government Area' = CouncilArea)
#merging data sets using a left join and setting the key as Local Government Area
mergeddf <- housePrices %>%
left_join(featureTidyCrime, by='Local Government Area')
When getting the structure of mergeddf we find that we only have three different types: Character, int, num. Method, Suburb, Regionname, Local Government Area have been changed from character to factor as they are categorical observations and Type has been changed to ordered with levels as h,t,u. Type with ordering is as followed: Houses generally go for a higher price, have larger land and its percentage growth is generally higher than a townhouse and unit. Townhouse second as its closer to a house and still on it’s own block of land, while units may have body corporate applied and generally don’t own the land they are on.
#Lets view the structure first
str(mergeddf)
'data.frame': 34857 obs. of 34 variables:
$ Suburb : chr "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
$ Address : chr "68 Studley St" "85 Turner St" "25 Bloomburg St" "18/659 Victoria St" ...
$ Rooms : int 2 2 2 3 3 3 4 4 2 2 ...
$ Type : chr "h" "h" "h" "u" ...
$ Price : int NA 1480000 1035000 NA 1465000 850000 1600000 NA NA NA ...
$ Method : chr "SS" "S" "S" "VB" ...
$ SellerG : chr "Jellis" "Biggin" "Biggin" "Rounds" ...
$ Date : chr "3/09/2016" "3/12/2016" "4/02/2016" "4/02/2016" ...
$ Distance : chr "2.5" "2.5" "2.5" "2.5" ...
$ Postcode : chr "3067" "3067" "3067" "3067" ...
$ Bedroom2 : int 2 2 2 3 3 3 3 3 4 3 ...
$ Bathroom : int 1 1 1 2 2 2 1 2 1 2 ...
$ Car : int 1 1 0 1 0 1 2 2 2 1 ...
$ Landsize : int 126 202 156 0 134 94 120 400 201 202 ...
$ BuildingArea : num NA NA 79 NA 150 NA 142 220 NA NA ...
$ YearBuilt : int NA NA 1900 NA 1900 NA 2014 2006 1900 1900 ...
$ Local Government Area : chr "Yarra" "Yarra" "Yarra" "Yarra" ...
$ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
$ Longtitude : num 145 145 145 145 145 ...
$ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
$ Propertycount : chr "4019" "4019" "4019" "4019" ...
$ Police Region : chr "1 North West Metro" "1 North West Metro" "1 North West Metro" "1 North West Metro" ...
$ 2010 : num 5785 5785 5785 5785 5785 ...
$ 2011 : num 5470 5470 5470 5470 5470 5470 5470 5470 5470 5470 ...
$ 2012 : num 5733 5733 5733 5733 5733 ...
$ 2013 : num 5544 5544 5544 5544 5544 ...
$ 2014 : num 5836 5836 5836 5836 5836 ...
$ 2015 : num 6122 6122 6122 6122 6122 ...
$ 2016 : num 6968 6968 6968 6968 6968 ...
$ 2017 : num 6409 6409 6409 6409 6409 ...
$ 2018 : num 7150 7150 7150 7150 7150 7150 7150 7150 7150 7150 ...
$ 2019 : num 6905 6905 6905 6905 6905 ...
$ Average Incident Reports : num 6192 6192 6192 6192 6192 ...
$ Total Reports in 10 Years: num 61922 61922 61922 61922 61922 ...
#lets apply proper transformations across them first we start with factors
#Type is house town house and unit and are ordered by what sells most in market
mergeddf$Type <- factor(mergeddf$Type, ordered = TRUE, levels = c("h","t",'u'))
mergeddf$Method <- as.factor(mergeddf$Method)
mergeddf$Suburb <- as.factor(mergeddf$Suburb)
mergeddf$Regionname <- as.factor(mergeddf$Regionname)
mergeddf$`Local Government Area` <- as.factor(mergeddf$`Local Government Area`)
#fix the date column as well by splitting it. We will be able to find better information
#about if a house sold in a certain month as opposed to the overall date.
#splitting the date column using lubridate
mergeddf$Date <- dmy(mergeddf$Date)
mergeddf <- mergeddf %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
select(-c("Date"))
#next the int variables
convertListNames <- c("Day","Month","Year","Postcode","Total Reports in 10 Years","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","Propertycount")
#for loop to apply the change
for (colName in convertListNames)
{
mergeddf[,colName] <- as.integer(mergeddf[,colName])
}
NAs introduced by coercionNAs introduced by coercion
#for loop for changing items into a double
dblListNames <- c("Distance", "Average Incident Reports")
for (colName in dblListNames)
{
mergeddf[,colName] <- as.double(mergeddf[,colName])
}
NAs introduced by coercion
There are a lot of odd values here, where bedroom, bathroom, land and building area have minimums of 0. This can’t be the case and is most likely mis-represented information. These values need to be changed to NA as you can’t have a house without land or a building and also without rooms or a toilet. There are also a lot of NA values that need to be filled. Median will be used to fill values that require a round number while the mode for categorical values and mean for double values. Landsize and BuildingArea also has values at 0.1 so we change all values below 20 to NA as this is also incorrectly retrieved data.
# This is the R chunk for the Scan I
summary(mergeddf)
Suburb Address Rooms Type Price Method SellerG
Reservoir : 844 Length:34857 Min. : 1.000 h:23980 Min. : 85000 S :19744 Length:34857
Bentleigh East: 583 Class :character 1st Qu.: 2.000 t: 3580 1st Qu.: 635000 SP : 5095 Class :character
Richmond : 552 Mode :character Median : 3.000 u: 7297 Median : 870000 PI : 4850 Mode :character
Glen Iris : 491 Mean : 3.031 Mean : 1050173 VB : 3108
Preston : 485 3rd Qu.: 4.000 3rd Qu.: 1295000 SN : 1317
Kew : 467 Max. :16.000 Max. :11200000 PN : 308
(Other) :31435 NA's :7610 (Other): 435
Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
Min. : 0.00 Min. :3000 Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0 Min. :1196
1st Qu.: 6.40 1st Qu.:3051 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 224.0 1st Qu.: 102.0 1st Qu.:1940
Median :10.30 Median :3103 Median : 3.000 Median : 2.000 Median : 2.000 Median : 521.0 Median : 136.0 Median :1970
Mean :11.18 Mean :3116 Mean : 3.085 Mean : 1.625 Mean : 1.729 Mean : 593.6 Mean : 160.3 Mean :1965
3rd Qu.:14.00 3rd Qu.:3156 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 670.0 3rd Qu.: 188.0 3rd Qu.:2000
Max. :48.10 Max. :3978 Max. :30.000 Max. :12.000 Max. :26.000 Max. :433014.0 Max. :44515.0 Max. :2106
NA's :1 NA's :1 NA's :8217 NA's :8226 NA's :8728 NA's :11810 NA's :21115 NA's :19306
Local Government Area Lattitude Longtitude Regionname Propertycount Police Region
Boroondara: 3675 Min. :-38.19 Min. :144.4 Southern Metropolitan :11836 Min. : 83 Length:34857
Darebin : 2851 1st Qu.:-37.86 1st Qu.:144.9 Northern Metropolitan : 9557 1st Qu.: 4385 Class :character
Moreland : 2122 Median :-37.81 Median :145.0 Western Metropolitan : 6799 Median : 6763 Mode :character
Glen Eira : 2006 Mean :-37.81 Mean :145.0 Eastern Metropolitan : 4377 Mean : 7573
Melbourne : 1952 3rd Qu.:-37.75 3rd Qu.:145.1 South-Eastern Metropolitan: 1739 3rd Qu.:10412
Banyule : 1861 Max. :-37.39 Max. :145.5 Eastern Victoria : 228 Max. :21650
(Other) :20390 NA's :7976 NA's :7976 (Other) : 321 NA's :3
2010 2011 2012 2013 2014 2015 2016 2017
Min. : 1696 Min. : 1711 Min. : 1735 Min. : 1954 Min. : 2030 Min. : 2083 Min. : 2750 Min. : 2404
1st Qu.: 3736 1st Qu.: 3482 1st Qu.: 3736 1st Qu.: 3899 1st Qu.: 3914 1st Qu.: 3913 1st Qu.: 4673 1st Qu.: 3932
Median : 4571 Median : 4444 Median : 4451 Median : 4396 Median : 4446 Median : 4838 Median : 5849 Median : 4580
Mean : 5044 Mean : 5088 Mean : 5360 Mean : 5204 Mean : 5295 Mean : 5645 Mean : 6401 Mean : 5631
3rd Qu.: 5938 3rd Qu.: 6298 3rd Qu.: 6452 3rd Qu.: 6401 3rd Qu.: 7372 3rd Qu.: 7299 3rd Qu.: 8303 3rd Qu.: 7196
Max. :12501 Max. :11520 Max. :11623 Max. :11235 Max. :10504 Max. :11939 Max. :13657 Max. :13113
NA's :307 NA's :307 NA's :307 NA's :307 NA's :307 NA's :307 NA's :307 NA's :307
2018 2019 Average Incident Reports Total Reports in 10 Years Year Month Day
Min. : 2231 Min. : 2523 Min. : 2112 Min. : 21117 Min. :2016 Min. : 1.000 Min. : 1.00
1st Qu.: 3949 1st Qu.: 3977 1st Qu.: 3944 1st Qu.: 39440 1st Qu.:2016 1st Qu.: 5.000 1st Qu.: 9.00
Median : 5041 Median : 5700 Median : 4834 Median : 48341 Median :2017 Median : 7.000 Median :17.00
Mean : 5659 Mean : 5857 Mean : 5519 Mean : 55186 Mean :2017 Mean : 7.142 Mean :15.88
3rd Qu.: 7354 3rd Qu.: 7032 3rd Qu.: 7030 3rd Qu.: 70300 3rd Qu.:2017 3rd Qu.:10.000 3rd Qu.:24.00
Max. :12926 Max. :13919 Max. :12294 Max. :122937 Max. :2018 Max. :12.000 Max. :30.00
NA's :307 NA's :307 NA's :307 NA's :307
#use this to see if any values are preset at 0 that shouldn't be
mergeddf %>% sapply(function(x) which(x == 0))
$Suburb
integer(0)
$Address
integer(0)
$Rooms
integer(0)
$Type
integer(0)
$Price
integer(0)
$Method
integer(0)
$SellerG
integer(0)
$Distance
[1] 15115 15116 15775 15776 15777 17121 17122 17639 17640 18167 18608 18609 19056 19057 19553 19554 19555 19556 20102 20671 20672 20673
[23] 21346 21347 21348 21349 21350 22575 22576 22577 23292 23293 23887 23888 24535 24536 24537 24538 24539 24540 25205 26166 26167 26168
[45] 26169 27144 27145 27146 27147 27955 28860 28861 28862 28863 28864 28865 28866 29679 29680 29681 29682 29683 30575 30999 31914 31915
[67] 31916 31917 32710 32711 32712 33516 33517 33518 33519 34472 34473
$Postcode
integer(0)
$Bedroom2
[1] 219 1391 1425 1550 3718 3813 5548 10108 11194 11235 12014 12218 13559 14709 15295 16323 18305
$Bathroom
[1] 229 482 846 1010 1550 1773 1785 2717 2854 3718 3813 4435 4568 4612 4638 5591 5828 6243 6250 6264 6714 8107
[23] 9133 9175 9281 9390 10486 10519 10520 10651 10880 10916 11175 11194 11302 11652 12218 13059 13182 13516 14871 16065 16323 17719
[45] 17866 18305
$Car
[1] 3 5 11 12 14 37 45 59 62 134 135 136 137 138 139 141 151 152 155 157 160 162
[23] 164 166 167 168 170 176 178 180 187 188 189 191 194 219 229 268 272 367 373 382 407 410
[45] 415 431 432 451 482 511 521 558 590 815 833 840 845 846 928 929 1010 1017 1181 1188 1264 1550
[67] 1644 1670 1693 1717 1773 1791 1824 1895 1927 2030 2141 2156 2171 2179 2186 2187 2188 2194 2201 2206 2214 2233
[89] 2243 2267 2271 2274 2280 2281 2286 2290 2291 2295 2306 2336 2347 2352 2358 2496 2537 2689 2715 2717 2854 2867
[111] 2877 2878 2881 2884 2889 2890 2891 2892 2894 2896 2900 2902 2903 2905 2917 2924 3067 3081 3212 3217 3220 3226
[133] 3227 3230 3239 3240 3241 3242 3243 3253 3255 3259 3263 3274 3309 3371 3375 3393 3426 3434 3447 3452 3461 3535
[155] 3536 3538 3547 3555 3563 3567 3569 3570 3593 3688 3718 3749 3788 3813 3814 3818 3825 3826 3827 3847 3850 3856
[177] 3860 3868 3869 3871 3875 3879 3886 3887 3890 3895 3898 3903 3907 3913 3916 3921 3929 3941 3946 3984 3989 4003
[199] 4045 4094 4102 4152 4200 4203 4208 4212 4221 4222 4223 4224 4235 4237 4249 4251 4253 4254 4255 4257 4261 4263
[221] 4270 4272 4273 4278 4280 4284 4285 4293 4295 4301 4307 4308 4310 4311 4313 4317 4318 4319 4322 4328 4338 4340
[243] 4343 4346 4349 4352 4356 4367 4368 4370 4379 4383 4384 4394 4399 4410 4435 4446 4460 4468 4482 4484 4488 4490
[265] 4491 4521 4568 4601 4612 4638 4650 4761 5087 5114 5185 5188 5214 5217 5220 5225 5248 5272 5273 5290 5335 5337
[287] 5368 5386 5387 5388 5417 5421 5441 5442 5453 5457 5458 5471 5473 5537 5540 5544 5556 5591 5606 5649 5658 5766
[309] 5781 5792 5796 5806 5816 5818 5821 5828 5838 5847 5862 5894 5926 5952 6000 6015 6037 6051 6141 6156 6243 6249
[331] 6250 6264 6284 6296 6302 6310 6322 6460 6489 6644 6662 6666 6668 6676 6684 6689 6690 6695 6714 6716 6724 6732
[353] 6754 6762 6767 6771 6773 6777 6779 6799 6812 6862 6866 6881 6882 6900 6919 6934 7049 7059 7072 7084 7086 7098
[375] 7103 7115 7117 7133 7185 7205 7210 7221 7234 7235 7241 7245 7246 7256 7268 7281 7287 7298 7320 7323 7334 7348
[397] 7353 7356 7357 7374 7390 7392 7399 7401 7414 7448 7470 7593 7602 7608 7624 7755 7779 7784 7788 7798 7799 7801
[419] 7812 7831 7835 7841 7842 7846 7848 7860 7865 7866 7869 7870 7871 7875 7880 7890 7906 7909 7916 7936 7952 7960
[441] 7967 7972 7974 7975 7977 7992 8004 8005 8007 8030 8036 8045 8046 8060 8061 8066 8074 8099 8101 8107 8112 8132
[463] 8206 8212 8282 8427 8597 8715 8734 8754 8757 8766 8767 8773 8778 8779 8791 8792 8796 8798 8800 8802 8804 8807
[485] 8808 8809 8810 8816 8827 8838 8839 8842 8851 8864 8872 8873 8878 8884 8897 8905 8915 8928 8932 8933 8958 8960
[507] 8962 8965 8977 9002 9003 9066 9069 9076 9081 9084 9091 9092 9093 9094 9097 9100 9106 9107 9109 9113 9116 9117
[529] 9121 9122 9123 9124 9125 9127 9128 9129 9133 9134 9135 9136 9139 9140 9145 9147 9148 9162 9175 9179 9180 9181
[551] 9183 9185 9197 9201 9202 9209 9214 9242 9259 9260 9265 9266 9279 9281 9295 9301 9308 9314 9315 9318 9319 9332
[573] 9334 9340 9341 9342 9344 9355 9356 9362 9374 9381 9382 9390 9413 9418 9423 9427 9431 9449 9451 9452 9465 9480
[595] 9507 9510 9532 9547 9549 9555 9558 9570 9572 9580 9582 9583 9585 9591 9618 9620 9625 9626 9645 9651 9756 9759
[617] 9761 9790 9794 9799 9829 9840 9852 9879 9960 9972 10094 10100 10105 10253 10289 10301 10302 10312 10327 10347 10350 10356
[639] 10357 10381 10384 10388 10395 10396 10422 10448 10464 10485 10486 10496 10507 10519 10520 10569 10576 10578 10594 10620 10627 10633
[661] 10644 10651 10660 10669 10677 10712 10714 10725 10730 10754 10762 10769 10770 10777 10778 10801 10805 10811 10812 10816 10852 10861
[683] 10867 10869 10876 10880 10895 10897 10898 10902 10909 10916 10935 10940 10957 10958 10961 10971 10976 10990 11032 11038 11039 11046
[705] 11063 11064 11070 11084 11117 11121 11131 11133 11137 11138 11144 11148 11151 11163 11165 11166 11169 11175 11181 11189 11193 11194
[727] 11227 11229 11302 11326 11357 11368 11397 11428 11444 11505 11538 11558 11573 11581 11582 11583 11592 11652 11839 11854 11859 11963
[749] 11967 11974 11976 11979 11987 12086 12102 12103 12105 12106 12109 12111 12113 12115 12141 12146 12150 12153 12165 12167 12179 12218
[771] 12308 12382 12401 12404 12406 12407 12414 12421 12422 12426 12439 12472 12501 12506 12520 12535 12543 12545 12546 12547 12548 12549
[793] 12551 12587 12612 12613 12617 12618 12623 12624 12640 12675 12676 12678 12679 12683 12685 12687 12777 12779 12780 12781 12783 12784
[815] 12788 12789 12794 12802 12804 12805 12806 12812 12822 12825 12940 12947 12948 12955 12957 12958 12961 12967 12978 12998 13013 13014
[837] 13016 13025 13026 13028 13029 13059 13104 13147 13165 13182 13213 13215 13220 13290 13291 13292 13303 13312 13327 13332 13334 13349
[859] 13367 13399 13430 13431 13439 13441 13442 13447 13465 13466 13476 13479 13483 13485 13486 13488 13516 13523 13588 13589 13593 13597
[881] 13601 13604 13610 13612 13620 13622 13632 13639 13640 13649 13656 13657 13662 13669 13673 13675 13685 13691 13695 13705 13716 13717
[903] 13718 13720 13722 13725 13745 13756 13793 13844 13855 13863 13878 13886 13888 13893 13901 13902 13916 13918 13923 13924 13934 13937
[925] 13938 13940 13949 13956 13958 14023 14040 14043 14061 14063 14069 14077 14099 14102 14105 14109 14112 14113 14116 14119 14134 14150
[947] 14161 14171 14172 14196 14204 14205 14210 14216 14228 14229 14276 14279 14335 14338 14350 14351 14355 14372 14378 14381 14420 14435
[969] 14458 14459 14470 14533 14534 14535 14538 14543 14570 14589 14599 14623 14633 14637 14664 14724 14789 14815 14824 14834 14842 14854
[991] 14871 14891 14974 14990 14991 15000 15053 15058 15074 15129
[ reached getOption("max.print") -- omitted 631 entries ]
$Landsize
[1] 4 18 24 26 30 32 41 43 49 51 54 58 68 69 89 156 171 178 182 193 207 218 219 220 250 251
[27] 296 336 366 368 373 378 381 383 386 387 388 394 398 400 403 404 406 408 412 421 423 425 433 436 437 440
[53] 441 442 445 447 448 450 453 456 461 462 470 471 473 474 477 480 481 482 484 486 488 510 522 528 537 551
[79] 552 556 559 560 568 581 750 813 825 827 831 832 833 834 835 837 838 839 845 846 860 873 884 891 896 928
[105] 931 957 964 970 1005 1184 1205 1229 1232 1255 1299 1312 1349 1352 1355 1378 1403 1421 1427 1510 1516 1547 1557 1572 1587 1593
[131] 1630 1637 1649 1672 1686 1742 1743 1762 1772 1773 1781 1784 1785 1787 1796 1800 1802 1816 1819 1822 1825 1826 1844 1846 1847 1870
[157] 1878 1884 1889 1895 1896 1910 1911 1916 1963 1976 2056 2083 2102 2140 2150 2151 2154 2155 2157 2162 2165 2167 2168 2194 2198 2208
[183] 2213 2215 2228 2241 2246 2248 2256 2260 2261 2263 2266 2276 2282 2283 2284 2301 2309 2316 2318 2332 2334 2335 2338 2342 2346 2350
[209] 2352 2356 2360 2362 2375 2378 2380 2384 2385 2387 2388 2390 2393 2395 2401 2406 2409 2414 2417 2420 2426 2427 2428 2539 2549 2616
[235] 2624 2652 2688 2700 2724 2731 2734 2748 2753 2760 2791 2799 2841 2847 2860 2875 2901 2908 2911 2916 2918 2920 2926 2928 2933 2935
[261] 2936 2937 2938 2939 2942 2943 2947 2957 2960 2962 2964 2966 2968 2973 2976 2980 2981 2983 2987 2990 2991 2992 3005 3008 3009 3014
[287] 3019 3020 3031 3038 3041 3047 3050 3053 3054 3055 3056 3059 3063 3073 3074 3077 3079 3080 3083 3089 3095 3096 3104 3106 3115 3149
[313] 3188 3211 3230 3232 3246 3251 3254 3258 3265 3267 3276 3286 3298 3302 3303 3306 3317 3344 3348 3351 3366 3372 3379 3402 3420 3427
[339] 3438 3443 3446 3449 3454 3474 3533 3537 3539 3541 3544 3546 3548 3549 3552 3559 3564 3565 3566 3568 3571 3572 3574 3579 3581 3582
[365] 3583 3584 3586 3587 3588 3598 3607 3609 3620 3640 3670 3701 3718 3740 3759 3764 3774 3781 3789 3797 3801 3804 3811 3812 3813 3814
[391] 3816 3818 3819 3823 3824 3825 3826 3827 3836 3839 3841 3843 3845 3848 3849 3851 3853 3854 3857 3858 3860 3865 3866 3870 3871 3877
[417] 3878 3879 3880 3881 3882 3883 3886 3887 3888 3889 3890 3892 3895 3897 3902 3903 3904 3907 3908 3909 3910 3911 3912 3915 3917 3921
[443] 3923 3924 3925 3926 3927 3928 3930 3932 3933 3935 3937 3940 3941 3948 3954 3956 3959 3965 3975 3982 3989 3994 3995 4009 4015 4019
[469] 4023 4028 4035 4039 4043 4046 4053 4061 4067 4073 4074 4076 4081 4086 4088 4090 4094 4095 4097 4108 4111 4116 4121 4130 4131 4135
[495] 4137 4138 4141 4146 4147 4161 4163 4166 4168 4170 4172 4177 4178 4189 4191 4192 4193 4195 4197 4201 4202 4204 4205 4211 4212 4213
[521] 4214 4225 4227 4229 4234 4237 4239 4242 4243 4244 4246 4247 4262 4269 4276 4296 4297 4298 4306 4307 4320 4323 4324 4329 4333 4336
[547] 4337 4341 4342 4348 4350 4359 4360 4361 4362 4364 4365 4366 4373 4375 4377 4379 4381 4386 4388 4392 4394 4397 4403 4404 4406 4408
[573] 4409 4411 4412 4419 4425 4426 4429 4432 4435 4436 4438 4442 4444 4448 4450 4455 4457 4463 4474 4475 4476 4479 4480 4487 4488 4489
[599] 4493 4514 4525 4535 4540 4557 4579 4581 4586 4591 4609 4615 4622 4634 4636 4638 4650 4665 4677 4682 4687 4696 4701 4704 4724 4737
[625] 4805 4810 4820 4840 4922 4997 5008 5015 5023 5039 5048 5060 5074 5093 5094 5099 5100 5105 5109 5112 5123 5136 5161 5174 5179 5186
[651] 5188 5189 5193 5194 5195 5197 5206 5208 5209 5211 5212 5218 5219 5220 5222 5228 5229 5232 5233 5234 5235 5241 5242 5244 5248 5250
[677] 5252 5255 5257 5261 5262 5264 5269 5270 5271 5275 5280 5281 5287 5288 5289 5293 5295 5301 5304 5305 5306 5309 5312 5314 5317 5318
[703] 5323 5327 5328 5331 5332 5341 5348 5350 5351 5355 5356 5358 5359 5360 5363 5414 5463 5500 5503 5514 5515 5522 5525 5529 5535 5536
[729] 5544 5552 5564 5566 5575 5591 5595 5607 5621 5639 5642 5732 5749 5768 5770 5771 5776 5797 5804 5805 5814 5816 5817 5821 5828 5832
[755] 5836 5837 5848 5850 5851 5853 5854 5856 5863 5869 5871 5884 5885 5888 5889 5892 5893 5895 5906 5924 5943 5945 5947 5949 5956 5958
[781] 5963 5964 5990 6003 6013 6014 6017 6024 6028 6041 6050 6059 6062 6070 6072 6083 6089 6094 6120 6141 6144 6176 6181 6189 6193 6195
[807] 6207 6217 6226 6229 6230 6231 6234 6241 6245 6247 6248 6250 6252 6253 6254 6264 6271 6273 6274 6275 6276 6278 6281 6283 6288 6292
[833] 6301 6303 6307 6316 6318 6320 6324 6326 6344 6353 6387 6391 6394 6397 6413 6421 6433 6439 6451 6453 6465 6467 6471 6473 6486 6490
[859] 6492 6493 6495 6497 6503 6521 6522 6525 6533 6534 6569 6571 6575 6576 6582 6589 6601 6615 6619 6632 6640 6641 6644 6651 6659 6661
[885] 6662 6665 6668 6672 6676 6677 6679 6680 6682 6683 6684 6686 6688 6689 6690 6694 6706 6709 6712 6713 6714 6716 6717 6718 6723 6724
[911] 6727 6728 6738 6739 6741 6742 6756 6763 6765 6768 6782 6802 6814 6820 6828 6829 6832 6833 6855 6857 6858 6860 6867 6876 6880 6888
[937] 6899 6900 6904 6910 6915 6924 6928 6931 6933 6941 6945 6952 7016 7018 7029 7042 7058 7061 7079 7081 7090 7145 7146 7157 7193 7206
[963] 7207 7208 7209 7213 7214 7216 7218 7219 7220 7222 7223 7226 7227 7229 7231 7235 7237 7239 7240 7242 7243 7247 7248 7251 7254 7259
[989] 7262 7264 7265 7270 7271 7275 7279 7283 7292 7309 7312 7315
[ reached getOption("max.print") -- omitted 1437 entries ]
$BuildingArea
[1] 7212 19776 19841 20039 20224 20263 20577 21156 21511 21562 21743 21859 21869 21955 22041 22211 22508 22682 22932 22995 23023 23086
[23] 23116 23128 23160 23243 23251 23322 23379 23380 23655 23691 24117 24129 24197 24206 24242 24259 24277 24345 24603 24640 25078 25087
[45] 25321 25353 25377 25413 25568 25709 25902 26101 26344 26619 26634 26839 27442 27565 27588 27630 27923 28728 28782 29457 30915 31465
[67] 31510 31542 31718 32404 32841 32937 33398 33661 33900 34084
$YearBuilt
integer(0)
$`Local Government Area`
integer(0)
$Lattitude
integer(0)
$Longtitude
integer(0)
$Regionname
integer(0)
$Propertycount
integer(0)
$`Police Region`
integer(0)
$`2010`
integer(0)
$`2011`
integer(0)
$`2012`
integer(0)
$`2013`
integer(0)
$`2014`
integer(0)
$`2015`
integer(0)
$`2016`
integer(0)
$`2017`
integer(0)
$`2018`
integer(0)
$`2019`
integer(0)
$`Average Incident Reports`
integer(0)
$`Total Reports in 10 Years`
integer(0)
$Year
integer(0)
$Month
integer(0)
$Day
integer(0)
#changing certain values of 0 to NA to change to mean/median
missingInfo <- c("Bathroom","Landsize","BuildingArea","Bedroom2")
#changing land and building area to accept nothing under 20m
mergeddf$Landsize[mergeddf$Landsize<20] <- NA
mergeddf$BuildingArea[mergeddf$BuildingArea<20] <- NA
#changing in the dataframe values == 0 to NA
mergeddf$Bedroom2[mergeddf$Bedroom2==0] <- NA
mergeddf$Bathroom[mergeddf$Bathroom==0] <- NA
For all numeric values we applying a capping procedure to cap the outliers in the bracket of the lower and upper quartiles instead of needed to remove the variables. First we take a look at the box plot of all numerics before applying capping to the variables that make the most sense. As the outliers aren’t large quantities and we’d like to keep as much data as possible capping would be oneo f the better choices here.
#look at all boxplots for numerics for outtlier detection
par(mfrow=c(2,5)) # sets the box plots to group
names <- colnames(mergeddf)
for (i in 1:length(colnames(mergeddf)))
{
if(is.numeric(mergeddf[,i])) boxplot(unlist(mergeddf[,i]), main=names[i])
}
#capping the outliers of numeric values based on the lower and upper quantiles
mergeddf$Rooms <- cap(mergeddf$Rooms)
mergeddf$Bedroom2 <- cap(mergeddf$Bedroom2)
mergeddf$Bathroom <- cap(mergeddf$Bathroom)
mergeddf$Car <- cap(mergeddf$Car)
mergeddf$Landsize <- cap(mergeddf$Landsize)
mergeddf$BuildingArea <- cap(mergeddf$BuildingArea)
We check for Pearson’s correlation coefficient to find variables that are have a higher correlation to the dependant variable Price. As when making a linear regression model higher correlated either in the negative or positive range will help when finding the future price of unknown houses. Once we have selected the features we impute the median across round values and mean for doubles.
#temp to check correlation for feature selection
#we would want to find correlation between the dependant variable price
tempdf <- na.omit(mergeddf)
corNames <- colnames(mergeddf)
for (i in corNames){
print(i)
if(is.numeric(tempdf[,i])) print(cor(tempdf[,i],tempdf$Price, method = "pearson"))
}
[1] "Suburb"
[1] "Address"
[1] "Rooms"
[1] 0.4235927
[1] "Type"
[1] "Price"
[1] 1
[1] "Method"
[1] "SellerG"
[1] "Distance"
[1] -0.3175514
[1] "Postcode"
[1] 0.07084243
[1] "Bedroom2"
[1] 0.4103007
[1] "Bathroom"
[1] 0.4083843
[1] "Car"
[1] 0.1655207
[1] "Landsize"
[1] 0.1902213
[1] "BuildingArea"
[1] 0.5523834
[1] "YearBuilt"
[1] -0.304047
[1] "Local Government Area"
[1] "Lattitude"
[1] -0.2543872
[1] "Longtitude"
[1] 0.232213
[1] "Regionname"
[1] "Propertycount"
[1] -0.04717755
[1] "Police Region"
[1] "2010"
[1] -0.174826
[1] "2011"
[1] -0.2362931
[1] "2012"
[1] -0.2560508
[1] "2013"
[1] -0.2410588
[1] "2014"
[1] -0.2731873
[1] "2015"
[1] -0.2541003
[1] "2016"
[1] -0.2305344
[1] "2017"
[1] -0.2348339
[1] "2018"
[1] -0.2250127
[1] "2019"
[1] -0.1961832
[1] "Average Incident Reports"
[1] -0.2355384
[1] "Total Reports in 10 Years"
[1] -0.2355384
[1] "Year"
[1] -0.05243772
[1] "Month"
[1] 0.04571516
[1] "Day"
[1] 0.003366343
#remove variables without great Pearson's correlation coefficient prior to imputing, Also remove items
#that wouldnt help with finding a price of a house e.g year it sold wouldnt help
FeatSeldf <- mergeddf %>% select(-c("Postcode", "Lattitude", "Longtitude", "Propertycount", "Day", "Year", 21:31, "Police Region", "SellerG","Address"))
#impute the values now that are missing
#using median to keep the round number for cols that need a round number
varsMedian <- c("Bathroom", "Bedroom2", "YearBuilt", "Car", "Total Reports in 10 Years") #creating a vector
for (i in varsMedian){
FeatSeldf[[i]] <- impute(FeatSeldf[[i]],median)#once found impute with mean
}
varsMean <- c("Landsize", "BuildingArea", "Distance", "Average Incident Reports", "Price")
for (i in varsMean){
FeatSeldf[[i]] <- impute(FeatSeldf[[i]],mean)#once found impute with mean
}
#check for any remaining missing values
colSums(is.na(FeatSeldf))
Suburb Rooms Type Price Method
0 0 0 0 0
Distance Bedroom2 Bathroom Car Landsize
0 0 0 0 0
BuildingArea YearBuilt Local Government Area Regionname Average Incident Reports
0 0 0 0 0
Total Reports in 10 Years Month
0 0
We check the histogram of our numeric values and transform them accordingly. Price, Distance and both right skewed graphs so we will see if we can create a gaussian distribution out of them and we will see if can can make a similair with landsize. The reasoning behind this is with linear regrerssion it is easier to apply it across a bell curve and get a closer answer for a predictor. With right skewed graphs applying a log transformation will help with normalising the data.
par(mfrow=c(2,5)) # sets the box plots to group
#Now for some transformation of data
#visualise the current skew of our data
names <- colnames(FeatSeldf)
for (i in 1:length(colnames(FeatSeldf)))
{
if(is.numeric(FeatSeldf[,i])) hist(unlist(FeatSeldf[,i]), main=names[i])
}
#make the skews into a normal distribution
logPrice <- log(FeatSeldf$Price)
hist(logPrice)
#lambda = 0 is also the same as applying a log transformation (just wanted to show you there are other ways to use boxcox)
boxCoxDist <- BoxCox(FeatSeldf$Distance, lambda = 0)
hist(boxCoxDist)
#boxcox lambda using auto
land <- BoxCox(FeatSeldf$Landsize, lambda = "auto")
hist(land)
#replace the values
FeatSeldf$Price <- logPrice
FeatSeldf$Distance <- boxCoxDist
FeatSeldf$Landsize <- land
#rename for better purposes
FeatSeldf <- FeatSeldf %>% rename(logPrice = Price, logDistance = Distance, BoxLandsize = Landsize, Bedroom = Bedroom2)
str(FeatSeldf)
'data.frame': 34857 obs. of 17 variables:
$ Suburb : Factor w/ 351 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Rooms : num 2 2 2 3 3 3 4 4 2 2 ...
$ Type : Ord.factor w/ 3 levels "h"<"t"<"u": 1 1 1 3 1 1 1 1 1 1 ...
$ logPrice : 'impute' num 13.9 14.2 13.8 13.9 14.2 ...
..- attr(*, "imputed")= int 1 4 8 9 10 13 14 21 32 35 ...
$ Method : Factor w/ 9 levels "PI","PN","S",..: 7 3 3 8 6 1 8 5 3 3 ...
$ logDistance : 'impute' num 0.916 0.916 0.916 0.916 0.916 ...
..- attr(*, "imputed")= int 29484
..- attr(*, "lambda")= num 0
$ Bedroom : 'impute' num 2 2 2 3 3 3 3 3 4 3 ...
..- attr(*, "imputed")= int 16 17 20 22 23 27 28 34 35 46 ...
$ Bathroom : 'impute' num 1 1 1 2 2 2 1 2 1 2 ...
..- attr(*, "imputed")= int 16 17 20 22 23 27 28 34 35 46 ...
$ Car : 'impute' num 1 1 0 1 0 1 2 2 2 1 ...
..- attr(*, "imputed")= int 16 17 20 22 23 27 28 34 35 46 ...
$ BoxLandsize : 'impute' num 32.4 44.2 37.3 82.5 33.7 ...
..- attr(*, "imputed")= int 4 16 17 18 20 22 23 24 26 27 ...
..- attr(*, "lambda")= num 0.635
$ BuildingArea : 'impute' num 152 152 79 152 150 ...
..- attr(*, "imputed")= int 1 2 4 6 9 10 11 16 17 18 ...
$ YearBuilt : 'impute' int 1970 1970 1900 1970 1900 1970 2014 2006 1900 1900 ...
..- attr(*, "imputed")= int 1 2 4 6 11 16 17 18 20 22 ...
$ Local Government Area : Factor w/ 34 levels "#N/A","Banyule",..: 33 33 33 33 33 33 33 33 33 33 ...
$ Regionname : Factor w/ 9 levels "#N/A","Eastern Metropolitan",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Average Incident Reports : 'impute' num 6192 6192 6192 6192 6192 ...
..- attr(*, "imputed")= int 14338 14346 14358 14390 14427 14508 14662 14663 14942 15153 ...
$ Total Reports in 10 Years: 'impute' num 61922 61922 61922 61922 61922 ...
..- attr(*, "imputed")= int 14338 14346 14358 14390 14427 14508 14662 14663 14942 15153 ...
$ Month : int 9 12 2 2 3 3 6 8 8 8 ...