This notebook provides a comprehensive modeling framework for estimating the current market value of homes in Riverside County.
libraries <- c("taRifx", "ggplot2", "plyr", "dplyr", "corrplot", "caret", "gridExtra", "scales", "Rmisc", "ggrepel", "randomForest", "psych", "xgboost", "Metrics")
invisible(lapply(libraries, require, character.only = TRUE))
## Loading required package: taRifx
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:taRifx':
##
## between, distinct, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: caret
## Loading required package: lattice
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: scales
## Loading required package: Rmisc
## Loading required package: ggrepel
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: xgboost
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
## Loading required package: Metrics
Import Raw DataTree Data
df.DataTree <- read.csv("~/Models/DataTree/Raw/06065.csv", sep="|", na.strings=c(""," ","NA", "\\N"), stringsAsFactors = FALSE, fill=TRUE, quote="", colClasses = "character")
The scripts below standardize the APN formatting, extract the residential parcels, and process the geographic identifiers:
source("~/Models/Source/APN_Formatting.R")
source("~/Models/Source/Extract_Residential.R")
source("~/Models/Source/GeoID_processing.R")
df.AVM <- df.DataTree %>%
select("apn", "CurrSalesPrice", "CurrSaleRecordingDate", "CurrSaleDocumentType", "SumResidentialUnits", "GarageParkingNbr", "BathTotalCalc", "Bedrooms" , "BuildingArea", "AssdImprovementValue", "AssdLandValue", "AssdTotalValue", "LotSizeSqFt", "SitusStdZIP5", "CurrSalesPriceCode", "SumLivingAreaSqFt", "YearBuilt", "StoriesNbrCode", "RoofCoverCode", "WaterCode", "SewerCode", "BuildingQualityCode", "SumGrossAreaSqFt", "Garage", "HeatCode", "SumGarageSqFt", "Municipality", "TaxAmount", "AirConditioningCode", "FireplaceCode", "OwnerOccupied", "StyleCode", "PoolCode", "OwnerStd1CorpInd", "SitusStdUnitType", "SiteInfluenceCode", "PropertyClassID", "StdLandUseCode", "SitusStdZIP5","geoID", "PropertyType")
rm(df.DataTree)
All fields were imported as character to preserve the raw data, so we now convert the variables identified as numeric during review.
numericVarName <- c("CurrSalesPrice", "BathTotalCalc", "Bedrooms", "LotSizeSqFt", "AssdTotalValue", "AssdLandValue", "AssdImprovementValue", "YearBuilt","GarageParkingNbr", "TaxAmount", "BuildingArea", "SumLivingAreaSqFt", "SumGrossAreaSqFt", "SumGarageSqFt", "SumResidentialUnits")
df.AVM[,numericVarName] <- as.numeric(unlist(df.AVM[,numericVarName]))
We must generate census location and assessor map-book location variables for later imputation of missing data.
df.AVM$geoTract <- substr(df.AVM$geoID, 1,11)
df.AVM$geoBlockGroup <- substr(df.AVM$geoID, 1,12)
df.AVM$geoBlock <- substr(df.AVM$geoID, 1,15)
df.AVM$apnMapBookPgNbr <- substr(df.AVM$apn, 1, 6)
df.AVM$apnMapBook <- substr(df.AVM$apn, 1, 3)
#Obtain sale year and month
df.AVM$yearSale <- substr(df.AVM$CurrSaleRecordingDate,1,4)
df.AVM$monthSale <- substr(df.AVM$CurrSaleRecordingDate,5,6)
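Since the recording dates are fixed-width YYYYMMDD strings, substr works here; an equivalent alternative (a sketch) via Date objects would also flag malformed dates, since those parse to NA:
saleDate <- as.Date(df.AVM$CurrSaleRecordingDate, "%Y%m%d")  #NA for any malformed date
df.AVM$yearSale <- format(saleDate, "%Y")
df.AVM$monthSale <- format(saleDate, "%m")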
For data imputation, we proceed as follows. First, identify whether values are missing at random or whether there is structure to the missingness, such as an NA in PoolCode indicating the property has no pool. We also convert character variables into ordinal integers where there is clear ordinality, or into factors where the levels are unordered categories; factors will later be converted to numeric via one-hot encoding (using the model.matrix function).
First let’s create a function for calculating the mode of some of our categorical variables for later group imputation.
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
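A quick sanity check of the helper: ties resolve to whichever value appears first, and NA counts as a value, so a group dominated by NAs yields an NA mode (which is why some group imputations below leave residual NAs):
Mode(c("A", "B", "B", "C"))  #returns "B"
Mode(c(NA, NA, "18"))        #returns NA, since NA is the most frequent "value"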
Count the missing values per variable:
missingVar <- as.data.frame(sort(colSums(sapply(df.AVM, is.na)), decreasing = TRUE))
View(missingVar)
The CurrSalesPriceCode values are defined in the DataTree code book as follows:
1 Qualifies as a comparable sale as defined by the county.
2 Liens Exceed Value or Assumption of Loan exceed market value.
3 Sales Price computed from the Transfer Tax, Transfer Tax not keyed.
4 Sales Price amount or Transfer Tax unreadable.
5 Unable to calculate Sales Price from given Transfer Tax.
6 Non-arms length transaction.
7 Sales Price or Transfer Tax rounded by county prior to computation.
8 Sold for Taxes.
9 Unable to calculate Sales Price from given Transfer Tax.
10 Transfer Tax on document indicated as EXEMPT.
11 Document states price as "0", "None", "No Consideration".
12 Sales Price computed using current Excise Tax Rate based on city name (WA)
13 Full amount computed from Transfer Tax or Excise Tax.
14 Full amount stated on Document.
15 Partial amount computed from Transfer Tax.
16 Sales price from Transfer Tax.
17 Comparable Market Value.
18 Exchange (HI)
19 Full amount from assessment file, when available.
20 Document states that Price/Transfer Tax is not a matter of public record.
21 From recorded Affidavit of Value or Verified.
22 Full amount from assessment file, when available.
23 Unable to compute
24 Comparable Market Value.
25 Full amount stated on Document.
26 Exchange (HI)
27 Full amount computed from Transfer Tax or Excise Tax.
28 Amount of redemption. Only used for Redemption Deeds. (DocType = "RD")
29 Sales Price computed from transfer tax based on either full consideration or assessed value (NE and Kenosha WI only)
30 Judgment Amount
31 Liens Exceed Value or Assumption of Loan. Liens/Loans involved in the transaction exceed market value.
32 Sales Price computed from Transfer Tax, Transfer Tax not keyed.
33 Non-arms length transaction.
34 Partial amount computed from Transfer Tax.
35 Sales Price or Transfer Tax rounded by county prior to computation.
36 Sold for Taxes.
37 Transfer Tax on document indicated as EXEMPT.
38 Sales price computed from Transfer Tax.
39 From recorded Affidavit of Value or Verified.
40 Sales Price from Excise Tax Rate (WA only)
41 Price/Transfer Tax not public record.
42 Document states price as "0", "None", "No Consideration".
43 Estimated Sales Price
table(df.AVM$CurrSalesPriceCode)
##
## 10 11 13 14 15 16 19 2 20 21
## 47 85 369098 9623 1586 32997 19 1 119 3
## 23 25 27 3 30 34 35 38 4 41
## 193 58 5305 2 51 7 1060 279 14 62
## 42 5 6 7 8
## 1 342 92 98863 77
Recode missing CurrSalesPriceCode values to "27" (full amount computed from Transfer Tax or Excise Tax).
df.AVM$CurrSalesPriceCode[is.na(df.AVM$CurrSalesPriceCode)] <- "27"
ggplot(df.AVM[!is.na(df.AVM$CurrSalesPrice) & df.AVM$yearSale == "2017",], aes(x=CurrSalesPriceCode, y=as.numeric(CurrSalesPrice))) +
geom_bar(stat='summary', fun.y = "median", fill='blue') +
scale_y_continuous(breaks= seq(0, 1000000, by=50000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
Looking at the price distribution, there doesn't appear to be much difference in the median sale price across the various CurrSalesPriceCode values.
The pool codes are listed below; however, it appears that PoolCode = 8 is the only value present in the data set. If the pool code is missing, we assume the property does not have a pool. Since the field effectively takes only two values, we'll create a dummy variable, hasPool, indicating the presence of a pool.
1 Above ground pool
2 Pool & Spa (both)
3 Community Pool or Spa
4 Enclosed
5 Heated Pool
6 Indoor Swimming Pool
7 Solar Heated
8 Pool (yes)
9 Spa or Hot Tub (only)
10 Vinyl In-ground Pool
table(df.AVM$PoolCode)
##
## 8
## 128523
df.AVM$PoolCode[is.na(df.AVM$PoolCode)] <- "0"
See LOC_INFLNCE sheet in DataTree Assessor Field Layout.
A Freeway proximity
B WATERFRONT-BEACH (Ocean, River or Lake)
C Contamination site
D Cul-de-sac
E CORNER
F NEGATIVE VIEW (POOR, BAD, ETC.)
G AVERAGE VIEW
H Historical
I SCHOOL PROXIMITY
J ADJACENT TO GOLF COURSE
K NO VIEW
L LAKE VIEW
M MOUNTAIN VIEW
N WATERFRONT - CANAL
O OCEAN VIEW
P Proximity to AIRPORT
Q GREEN BELT
R Proximity to RAILROAD
S Major street/thoroughfare
T High traffic area
U RIVER VIEW
V View not specified
W Waterfront-Not Specified
Z FLOOD PLAIN/FLOOD ZONE
table(df.AVM$SiteInfluenceCode)
##
## J W
## 38916 3835
Assign "None" to missing values, indicating there is no SiteInfluenceCode.
df.AVM$SiteInfluenceCode[is.na(df.AVM$SiteInfluenceCode)] <- "None"
We'll assume that homes with the same number of bedrooms within the same census block have the same living-area square footage, as is typical of tract homes.
df.AVM <- df.AVM %>%
group_by(geoBlock, Bedrooms) %>%
mutate(meanLivingAreaSqft = mean(SumLivingAreaSqFt, na.rm = TRUE))
df.AVM$SumLivingAreaSqFt[is.na(df.AVM$SumLivingAreaSqFt)] <- df.AVM$BuildingArea[is.na(df.AVM$SumLivingAreaSqFt)]
df.AVM$SumLivingAreaSqFt[is.na(df.AVM$SumLivingAreaSqFt)] <- df.AVM$meanLivingAreaSqft[is.na(df.AVM$SumLivingAreaSqFt)]
df.AVM <- ungroup(df.AVM)
df.AVM$meanLivingAreaSqft <- NULL
Let's summarize SumGarageSqFt by Garage. It appears we can recode the NAs to 0. Garage code 3 indicates a carport, which would explain missing SumGarageSqFt for those records.
df.AVM %>%
group_by(Garage) %>%
summarize(avg = mean(SumGarageSqFt, na.rm=TRUE))
## # A tibble: 6 x 2
## Garage avg
## <chr> <dbl>
## 1 1 537.
## 2 10 432
## 3 3 306.
## 4 4 454.
## 5 7 559.
## 6 <NA> NaN
df.AVM$SumGarageSqFt[is.na(df.AVM$SumGarageSqFt)] <- 0
Replace missing YearBuilt with the median YearBuilt within each geoBlock.
df.AVM <- df.AVM %>%
group_by(geoBlock) %>%
mutate(medYearBuilt = median(round(YearBuilt,digits = 0), na.rm=TRUE))
df.AVM$YearBuilt[is.na(df.AVM$YearBuilt)] <- df.AVM$medYearBuilt[is.na(df.AVM$YearBuilt)]
df.AVM$YearBuilt <- round(df.AVM$YearBuilt)
df.AVM <- ungroup(df.AVM)
df.AVM$medYearBuilt <- NULL
There may be some exponential relationship to explore here.
df.AVM$propertyAge <- 2019 - df.AVM$YearBuilt
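The reference year 2019 is hardcoded above; if the notebook will be rerun in later years, a dynamic version (a minimal sketch) keeps the age current:
#derive the reference year from the system clock instead of hardcoding it
df.AVM$propertyAge <- as.numeric(format(Sys.Date(), "%Y")) - df.AVM$YearBuilt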
Plot propertyAge against CurrSalesPrice.
tb1 <- ggplot(data=df.AVM[!is.na(df.AVM$CurrSalesPrice) & df.AVM$CurrSalesPrice < 1000000,], aes(x=propertyAge, y=CurrSalesPrice))+
geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
scale_y_continuous(labels = comma)
tb2 <- ggplot(data=df.AVM, aes(x=propertyAge)) +
geom_bar()
grid.arrange(tb1, tb2)
## Warning: Removed 230 rows containing non-finite values (stat_smooth).
## Warning: Removed 230 rows containing missing values (geom_point).
## Warning: Removed 604 rows containing non-finite values (stat_count).
Garage is tricky: are we to assume that NAs indicate no garage? There's an indicator for "None" (11), yet no 11 values appear in the data, so that may not be the case. After tabling Garage against GarageParkingNbr, it appears the NAs for Garage have 0 for GarageParkingNbr, so we will recode the NAs to 11.
1 Attached Garage
2 Built-in
3 Carport
4 Detached Garage
5 Pole
6 Offsite
7 Garage
8 Unimproved
9 Parking Lot
10 Mixed
11 None
12 Open
13 Paved/Surfaced
14 Ramp
15 Parking Structure
16 Tuckunder
17 Underground/Basement
18 Covered
19 Yes - Unspecified
table(df.AVM$Garage)
##
## 1 10 3 4 7
## 514239 1 22687 39659 641
df.AVM %>%
count(Garage, GarageParkingNbr)
## # A tibble: 81 x 3
## Garage GarageParkingNbr n
## <chr> <dbl> <int>
## 1 1 0 1222
## 2 1 1 34422
## 3 1 2 436207
## 4 1 3 31355
## 5 1 4 8484
## 6 1 5 1593
## 7 1 6 504
## 8 1 7 187
## 9 1 8 108
## 10 1 9 46
## # ... with 71 more rows
df.AVM$Garage[is.na(df.AVM$Garage)] <- "11"
Replace missing StoriesNbrCode with the modal value within each geoBlock and bedroom count.
1 1 story with attic
2 1 story with basement
3 2 story with basement
4 3 Split Levels
5 4 Split Levels
6 with attic
7 with attic and basement
8 with basement
9 Bi-level
10 Split Entry
11 Split-Foyer
12 Split Level
table(df.AVM$StoriesNbrCode)
##
## 100 200 300 400
## 394528 206746 3214 2504
df.AVM <- df.AVM %>%
group_by(geoBlock, Bedrooms) %>%
mutate(modeStoriesNbrCode = Mode(StoriesNbrCode))
df.AVM$StoriesNbrCode[is.na(df.AVM$StoriesNbrCode)] <- df.AVM$modeStoriesNbrCode[is.na(df.AVM$StoriesNbrCode)]
df.AVM <- ungroup(df.AVM)
df.AVM$modeStoriesNbrCode <- NULL
1 Asbestos
2 Built-up
3 Composition Shingle
4 Concrete
5 Metal
6 Slate
7 Gravel/Rock
8 Tar & Gravel
9 Bermuda
10 Masonite/ Cement Shake
11 Fiberglass
12 Aluminum
13 Wood Shake/ Shingles
14 Other
15 Asphalt
16 Roll Composition
17 Steel
18 Tile
19 Urethane
20 Shingle (Not Wood)
21 Wood
22 Gypsum
table(df.AVM$RoofCoverCode)
##
## 13 18 7
## 51294 406677 148792
Recode missing roof codes to the modal value, tile (18).
df.AVM$RoofCoverCode[is.na(df.AVM$RoofCoverCode)] <- "18"
1 Municipal
2 None
3 Storm
4 Septic
5 Yes
table(df.AVM$SewerCode)
##
## 1 2 5
## 548235 56836 54
Recode missing sewer codes to the dominant value, municipal (1).
df.AVM$SewerCode[is.na(df.AVM$SewerCode)] <- "1"
table(df.AVM$WaterCode)
##
## 3 6
## 5599 600847
Recode missing water codes to the dominant value (6).
df.AVM$WaterCode[is.na(df.AVM$WaterCode)] <- "6"
summary(df.AVM$TaxAmount)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 236865 374316 446777 571904 31682784 11205
Impute missing TaxAmount with the mean TaxAmount within each assessor map-book page.
df.AVM <- df.AVM %>%
group_by(apnMapBookPgNbr) %>%
mutate(meanTaxAmount = mean(TaxAmount, na.rm=TRUE))
df.AVM$TaxAmount[is.na(df.AVM$TaxAmount)] <- df.AVM$meanTaxAmount[is.na(df.AVM$TaxAmount)]
df.AVM <- ungroup(df.AVM)
df.AVM$meanTaxAmount <- NULL
1 = A, 2 = A-, 3 = A+, 4 = B, 5 = B-, 6 = B+, 7 = C, 8 = C-, 9 = C+, 10 = D, 11 = D-, 12 = D+, 13 = E, 14 = E-, 15 = E+
table(df.AVM$BuildingQualityCode)
##
## 1 10 4 7
## 4048 90 136982 178533
df.AVM %>%
group_by(BuildingQualityCode) %>%
summarize(meanSalePrice = mean(CurrSalesPrice, na.rm=TRUE))
## # A tibble: 5 x 2
## BuildingQualityCode meanSalePrice
## <chr> <dbl>
## 1 1 1403912.
## 2 10 96990.
## 3 4 402031.
## 4 7 261261.
## 5 <NA> 288337.
#First impute by Tract
df.AVM <- df.AVM %>%
group_by(geoTract) %>%
mutate(modeBuildingQualityCode = Mode(BuildingQualityCode))
df.AVM$BuildingQualityCode[is.na(df.AVM$BuildingQualityCode)] <- df.AVM$modeBuildingQualityCode[is.na(df.AVM$BuildingQualityCode)]
table(is.na(df.AVM$BuildingQualityCode))
##
## FALSE TRUE
## 389723 223705
For tracts with no observed quality codes, the group mode is itself NA, so impute the remainder with "4" (B grade).
df.AVM$BuildingQualityCode[is.na(df.AVM$BuildingQualityCode)] <- "4"
df.AVM <- ungroup(df.AVM)
df.AVM$modeBuildingQualityCode <- NULL
Assume a missing FireplaceCode means no fireplace and recode to "0".
df.AVM$FireplaceCode[is.na(df.AVM$FireplaceCode)] <- "0"
The only value present in the data is central (1). Can we assume that NAs mean no AC? I'll go ahead and google some APNs with missing AC codes. After googling a few, it appears these properties are all in Corona; they should in fact have AC, so the gap is likely an artifact of how the city of Corona reports. Nearly all homes in Southern California have AC, so I don't expect this field to correlate much with sale price; we'll drop it.
df.AVM %>%
filter(is.na(AirConditioningCode)) %>%
select(apn)
## # A tibble: 74,219 x 1
## apn
## <chr>
## 1 009000494
## 2 009000531
## 3 009000822
## 4 009000853
## 5 953021002
## 6 953021061
## 7 922022024
## 8 953030060
## 9 953042021
## 10 961241004
## # ... with 74,209 more rows
Only one value appears in the data, "3" (central heating). Can we assume the NAs mean none? After googling some homes, probably; they are concentrated in Corona as well. HeatCode has zero variance across Riverside County, so there is no need to use it in the model.
1 Baseboard
2 Electric
3 Central
4 Forced air unit
5 Oil
6 Floor/W
7 Gravity
8 Heat Pump
9 Geo-thermal
10 Hot Water
11 Gas
12 Partial
13 Radiant
14 None
15 Other
16 Steam
17 Coal
18 Space/Suspended
19 Convection
20 Solar
21 Vent
22 Wood Burning
23 Propane
24 Yes
25 Zone
table(df.AVM$HeatCode)
##
## 3
## 560308
df.AVM %>%
filter(is.na(HeatCode)) %>%
select(apn)
## # A tibble: 53,120 x 1
## apn
## <chr>
## 1 009000494
## 2 009000531
## 3 009000822
## 4 009000853
## 5 961241004
## 6 110063015
## 7 110083016
## 8 110084004
## 9 110084005
## 10 110084019
## # ... with 53,110 more rows
The StyleCode code book stops at 28, yet the only value in the data is 29. It may still mean something, so we'll recode missing values to "0" and later create a dummy (sketched below).
df.AVM$StyleCode[is.na(df.AVM$StyleCode)] <- "0"
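A minimal sketch of that dummy, using the hypothetical name hasStyle:
#1 if any style was reported, 0 for the recoded missing values
df.AVM$hasStyle <- as.integer(df.AVM$StyleCode != "0")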
View the missing data once more and keep only the variables necessary for modeling.
Count the missing values per variable:
missingVar <- as.data.frame(sort(colSums(sapply(df.AVM, is.na)), decreasing = TRUE))
View(missingVar)
df.AVM$typeProperty <- df.AVM$StdLandUseCode
table(df.AVM$typeProperty)
##
## 1001 1004 1009 1010
## 544034 66264 2928 202
Let's plot to see whether the median sale price differs by property type; I expect it does.
ggplot(df.AVM[!is.na(df.AVM$CurrSalesPrice),], aes(x=as.factor(typeProperty), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 2000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
Interesting, I didn't expect PUD (1009) to have the highest median sale price. We'll factorize with no ordinality.
df.AVM$typeProperty.factor <- factor(df.AVM$typeProperty)
df.AVM$typeSale <- df.AVM$CurrSalesPriceCode
table(df.AVM$typeSale)
##
## 10 11 13 14 15 16 19 2 20 21
## 47 85 369098 9623 1586 32997 19 1 119 3
## 23 25 27 3 30 34 35 38 4 41
## 193 58 98749 2 51 7 1060 279 14 62
## 42 5 6 7 8
## 1 342 92 98863 77
#Recode any remaining missing sale types to "27"
df.AVM$typeSale[is.na(df.AVM$CurrSalesPriceCode)] <- "27"
Factorize
df.AVM$typeSale.factor <- factor(df.AVM$typeSale)
df.AVM$typeGarage <- df.AVM$Garage
table(df.AVM$typeGarage)
##
## 1 10 11 3 4 7
## 514239 1 36201 22687 39659 641
Factorize, we can’t assume ordinality.
df.AVM$typeGarage.factor <- factor(df.AVM$typeGarage)
df.AVM$typeBuildingQuality <- df.AVM$BuildingQualityCode
table(df.AVM$typeBuildingQuality)
##
## 1 10 4 7
## 4048 90 394963 214327
Factorize: there are clearly levels to this, so we should probably order them, but let's first check by plotting CurrSalesPrice.
ggplot(df.AVM[!is.na(df.AVM$CurrSalesPrice),], aes(x=as.factor(typeBuildingQuality), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 1000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
Yes, it's ordinal. There aren't many values at 10, though, so let's revalue those to 7, then factorize.
df.AVM$typeBuildingQuality[df.AVM$typeBuildingQuality == "10"] <- "7"
df.AVM$typeBuildingQuality.factor <- factor(df.AVM$typeBuildingQuality, levels = c("7", "4", "1"), ordered = TRUE)
df.AVM$hasPool <- 0
df.AVM$hasPool[df.AVM$PoolCode == "2"] <- 1
df.AVM$hasPool[df.AVM$PoolCode == "8"] <- 1
df.AVM$hasJacuzzi <- 0
df.AVM$hasJacuzzi[df.AVM$PoolCode == "2"] <- 1
df.AVM$hasJacuzzi[df.AVM$PoolCode == "9"] <- 1
library(gmodels)
CrossTable(df.AVM$hasPool, df.AVM$hasJacuzzi)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 613428
##
##
## | df.AVM$hasJacuzzi
## df.AVM$hasPool | 0 | Row Total |
## ---------------|-----------|-----------|
## 0 | 484905 | 484905 |
## | 0.790 | |
## ---------------|-----------|-----------|
## 1 | 128523 | 128523 |
## | 0.210 | |
## ---------------|-----------|-----------|
## Column Total | 613428 | 613428 |
## ---------------|-----------|-----------|
##
##
Create typeView factor variable
df.AVM$typeView <- "No View"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "None"] <- "No View"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "G"] <- "Golf Course"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "J"] <- "Golf Course"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "V"] <- "Golf Course"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "W"] <- "Waterfront"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "L"] <- "Waterfront"
df.AVM$typeView[df.AVM$SiteInfluenceCode == "B"] <- "Waterfront"
Let's plot to see whether there is ordinality.
ggplot(df.AVM[!is.na(df.AVM$CurrSalesPrice),], aes(x=as.factor(typeView), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 1000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
Factorize. Per my SME, being on the water is almost always better than living on a golf course, so let's go ordinal.
df.AVM$typeView.factor <- factor(df.AVM$typeView, levels = c("No View", "Golf Course", "Waterfront"), ordered = TRUE)
df.AVM$numberStories <- 1
df.AVM$numberStories[df.AVM$StoriesNbrCode == "200"] <- 2
df.AVM$numberStories[df.AVM$StoriesNbrCode == "300"] <- 3
df.AVM$numberStories[df.AVM$StoriesNbrCode == "400"] <- 3
df.AVM$numberStories[df.AVM$StoriesNbrCode == "500"] <- 3
df.AVM$numberStories[df.AVM$StoriesNbrCode == "600"] <- 3
df.AVM$numberStories[df.AVM$StoriesNbrCode == "700"] <- 3
df.AVM$numberStories[df.AVM$StoriesNbrCode == "800"] <- 3
Factorize: although it seems this should be ordinal, one-story homes are often more desirable, so there is not necessarily a monotonic relationship between price and number of stories.
df.AVM$numberStories.factor <- factor(df.AVM$numberStories)
df.AVM$typeRoof <- df.AVM$RoofCoverCode
Factorize:
df.AVM$typeRoof.factor <- factor(df.AVM$typeRoof)
df.AVM$typeWater <- df.AVM$WaterCode
Factorize:
df.AVM$typeWater.factor <- factor(df.AVM$typeWater)
df.AVM$hasFireplace[df.AVM$FireplaceCode == "0"] <- 0
## Warning: Unknown or uninitialised column: 'hasFireplace'.
df.AVM$hasFireplace[df.AVM$FireplaceCode == "1"] <- 1
df.AVM$isCorp[is.na(df.AVM$OwnerStd1CorpInd)] <- 0
## Warning: Unknown or uninitialised column: 'isCorp'.
df.AVM$isCorp[df.AVM$OwnerStd1CorpInd == "T"] <- 1
table(df.AVM$isCorp)
##
## 0 1
## 586099 27329
df.AVM$isOwnerOcc[is.na(df.AVM$OwnerOccupied)] <- 0
## Warning: Unknown or uninitialised column: 'isOwnerOcc'.
df.AVM$isOwnerOcc[df.AVM$OwnerOccupied == "Y"] <- 1
table(df.AVM$isOwnerOcc)
##
## 0 1
## 158850 454578
Summarize CurrSalesPrice for all years
summary(df.AVM$CurrSalesPrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 165000 263000 316624 380000 65000000 88979
Plot a histogram of CurrSalesPrice for rows with CurrSalesPrice > 0 and CurrSalesPrice < 1,000,000.
ggplot(data=df.AVM[!is.na(df.AVM$CurrSalesPrice) & df.AVM$CurrSalesPrice > 0 & df.AVM$CurrSalesPrice < 1000000,], aes(x=CurrSalesPrice)) +
geom_histogram(fill="green", binwidth = 10000) +
scale_x_continuous(breaks= seq(0, 1000000, by=100000), labels = comma)
Plot a histogram of CurrSalesPrice for high-value homes (CurrSalesPrice > 1,000,000).
ggplot(data=df.AVM[!is.na(df.AVM$CurrSalesPrice) & df.AVM$CurrSalesPrice > 1000000,], aes(x=CurrSalesPrice)) +
geom_histogram(fill="green", binwidth = 1000000) +
scale_x_continuous(breaks= seq(1000000, 50000000, by=1000000), labels = comma) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Let’s look at the median sale price by typeSale.
ggplot(df.AVM[!is.na(df.AVM$CurrSalesPrice) & df.AVM$CurrSalesPrice > 100000 & df.AVM$CurrSalesPrice < 1000000,], aes(x=typeSale.factor, y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 1000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
Let's restrict the modeling sample to recent sales (within roughly the last two years) to remove the annual price trend. 2017 is the last year for which there is complete data.
df.AVM$daysElapsedSale <- Sys.Date() - as.Date(df.AVM$CurrSaleRecordingDate, "%Y%m%d")
df.AVM_train <- df.AVM %>%
filter(!is.na(yearSale) & daysElapsedSale < 730 & CurrSalesPrice != 0)
We can see some slight seasonality in the median sale price by month, with the summer months being the highest. This makes sense: there is typically more demand in the summer, when everyone wants to move while the kids are out of school.
ys <- ggplot(df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice),], aes(x=as.factor(yearSale), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 1000000, by=25000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
coord_cartesian(ylim = c(0, 500000)) +
geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice
ms <- ggplot(df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice),], aes(x=monthSale, y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 1000000, by=25000), labels = comma) +
geom_label(stat = "count", size = 2.5, aes(label = ..count.., y = ..count..)) +
coord_cartesian(ylim = c(0, 500000)) +
geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice
grid.arrange(ys, ms, widths=c(1,2))
numericVar <- which(sapply(df.AVM_train, is.numeric))
cat('There are', length(numericVar), 'numeric variables in the dataset, including the response variable CurrSalesPrice')
## There are 22 numeric variables in the dataset, including the response variable CurrSalesPrice
We can see from the plot below that there's definitely some collinearity in the data, with AssdImprovementValue having the highest Pearson correlation with CurrSalesPrice at 0.82. Note the other collinear variables, which we may have to interact or transform to remove multicollinearity during subsequent modeling.
complete_numVar <- df.AVM_train[,numericVar]
#correlations of df.AVM_train numeric variables
cor_numVar <- cor(complete_numVar, use="pairwise.complete.obs")
## Warning in cor(complete_numVar, use = "pairwise.complete.obs"): the
## standard deviation is zero
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'CurrSalesPrice'], decreasing = TRUE))
#select only high correlations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.1)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", number.cex=.7)
ggplot(data=df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice) & df.AVM_train$CurrSalesPrice < 1000000 & df.AVM_train$BuildingArea > 0 ,], aes(x=BuildingArea, y=CurrSalesPrice))+
geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
scale_y_continuous(labels = comma)
nb1 <- ggplot(df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice),], aes(x=reorder(SitusStdZIP5, CurrSalesPrice, FUN=median), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue') + labs(x='Zip', y='Median Sale Price') +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) +
scale_y_continuous(breaks= seq(0, 1000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=1) +
geom_hline(yintercept=350000, linetype="dashed", color = "red") #dashed line is median SalePrice
nb2 <- ggplot(df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice),], aes(x=reorder(SitusStdZIP5, CurrSalesPrice, FUN=median), y=CurrSalesPrice)) +
geom_bar(stat='summary', fun.y = "mean", fill='blue') + labs(x='Zip', y='Mean Sale Price') +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) +
scale_y_continuous(breaks= seq(0, 1000000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=1) +
geom_hline(yintercept=350000, linetype="dashed", color = "red") #dashed line is mean SalePrice
grid.arrange(nb1, nb2)
tb1 <- ggplot(data=df.AVM_train[!is.na(df.AVM_train$CurrSalesPrice) & df.AVM_train$CurrSalesPrice < 1000000 & df.AVM_train$BathTotalCalc > 0 ,], aes(x=as.factor(BathTotalCalc), y=CurrSalesPrice))+
geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
scale_y_continuous(labels = comma)
tb2 <- ggplot(data=df.AVM_train, aes(x=as.factor(BathTotalCalc))) +
geom_bar()
grid.arrange(tb1, tb2)
Median Sale Price by Geography. The geographic hierarchy runs from coarsest to finest: ZIP code, census tract, block group, block.
Let's first obtain the median sale price and then its percent rank, starting at the ZIP-code level.
priceZip <- df.AVM_train %>%
group_by(SitusStdZIP5) %>%
summarise(medianZipPrice = median(CurrSalesPrice))
priceZip <- priceZip %>%
mutate(percentileZipPrice = percent_rank(medianZipPrice))
priceTract <- df.AVM_train %>%
group_by(geoTract) %>%
summarise(medianTractPrice = median(CurrSalesPrice))
priceTract <- priceTract %>%
mutate(percentileTractPrice = percent_rank(medianTractPrice))
priceBlockGroup <- df.AVM_train %>%
group_by(geoBlockGroup) %>%
summarise(medianBlockGroupPrice = median(CurrSalesPrice))
priceBlockGroup <- priceBlockGroup %>%
mutate(percentileBlockGroupPrice = percent_rank(medianBlockGroupPrice))
priceBlock <- df.AVM_train %>%
group_by(geoBlock) %>%
summarise(medianBlockPrice = median(CurrSalesPrice))
priceBlock <- priceBlock %>%
mutate(percentileBlockPrice = percent_rank(medianBlockPrice))
priceBlock$blockPriceIndex <- round(priceBlock$percentileBlockPrice * 10)
priceBlock$blockPriceIndex.factor <- factor(priceBlock$blockPriceIndex, ordered = TRUE)
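The four blocks above repeat one pattern at successive geographic levels; a generic helper could remove the duplication. A sketch, assuming a dplyr version with tidy-eval support (0.7+) and using the hypothetical name price_index_by:
price_index_by <- function(df, geoVar, prefix) {
  medCol <- paste0("median", prefix, "Price")
  pctCol <- paste0("percentile", prefix, "Price")
  df %>%
    group_by_at(geoVar) %>%                          #group by the column named in geoVar
    summarise(!!medCol := median(CurrSalesPrice)) %>%
    mutate(!!pctCol := percent_rank(.data[[medCol]]))
}
#e.g. priceZip <- price_index_by(df.AVM_train, "SitusStdZIP5", "Zip")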
ggplot(data=priceBlock[priceBlock$medianBlockPrice < 1000000,], aes(x=as.numeric(medianBlockPrice))) +
geom_histogram(binwidth = 10000) + scale_x_continuous(labels = comma) #$10k bins
df.AVM <- left_join(df.AVM, priceZip, by = "SitusStdZIP5")
df.AVM <- left_join(df.AVM, priceTract, by = "geoTract")
df.AVM <- left_join(df.AVM, priceBlockGroup, by = "geoBlockGroup")
df.AVM <- left_join(df.AVM, priceBlock, by = "geoBlock")
df.AVM <- df.AVM %>%
select("apn", "CurrSalesPrice", "SumResidentialUnits", "GarageParkingNbr", "BathTotalCalc", "Bedrooms", "BuildingArea", "AssdImprovementValue", "AssdLandValue", "LotSizeSqFt", "SitusStdZIP5", "YearBuilt", "SumGarageSqFt", "TaxAmount", "percentileBlockGroupPrice", "percentileBlockPrice", "yearSale", "monthSale", "percentileTractPrice", "percentileZipPrice", "daysElapsedSale", contains(".factor"), contains("has"), contains("is"))
Count the missing values per variable:
missingVar <- as.data.frame(sort(colSums(sapply(df.AVM, is.na)), decreasing = TRUE))
View(missingVar)
We do have some missing data for the geo price indices: replace a missing percentileBlockPrice with percentileBlockGroupPrice, a missing percentileBlockGroupPrice with percentileTractPrice, and a missing percentileTractPrice with percentileZipPrice. Rows where percentileZipPrice is also missing are dropped below.
#backfill each missing percentile from the next-coarser geography (index both sides so the replacement values stay row-aligned)
df.AVM$percentileBlockPrice[is.na(df.AVM$percentileBlockPrice)] <- df.AVM$percentileBlockGroupPrice[is.na(df.AVM$percentileBlockPrice)]
df.AVM$percentileBlockGroupPrice[is.na(df.AVM$percentileBlockGroupPrice)] <- df.AVM$percentileTractPrice[is.na(df.AVM$percentileBlockGroupPrice)]
df.AVM$percentileTractPrice[is.na(df.AVM$percentileTractPrice)] <- df.AVM$percentileZipPrice[is.na(df.AVM$percentileTractPrice)]
#repeat the block backfill to pick up blocks whose block group was just filled
df.AVM$percentileBlockPrice[is.na(df.AVM$percentileBlockPrice)] <- df.AVM$percentileBlockGroupPrice[is.na(df.AVM$percentileBlockPrice)]
df.AVM$zipPriceIndex <- round(df.AVM$percentileZipPrice * 10)
df.AVM$zipPriceIndex.factor <- factor(df.AVM$zipPriceIndex, ordered = TRUE)
df.AVM$tractPriceIndex <- round(df.AVM$percentileTractPrice * 10)
df.AVM$tractPriceIndex.factor <- factor(df.AVM$tractPriceIndex, ordered = TRUE)
df.AVM$blockGroupPriceIndex <- round(df.AVM$percentileBlockGroupPrice * 10)
df.AVM$blockGroupPriceIndex.factor <- factor(df.AVM$blockGroupPriceIndex, ordered = TRUE)
df.AVM$blockPriceIndex <- round(df.AVM$percentileBlockPrice * 10)
df.AVM$blockPriceIndex.factor <- factor(df.AVM$blockPriceIndex, ordered = TRUE)
Count the missing values per variable:
missingVar <- as.data.frame(sort(colSums(sapply(df.AVM, is.na)), decreasing = TRUE))
View(missingVar)
We’ll just drop all other missing data.
df.AVM <- df.AVM %>%
filter(!is.na(SitusStdZIP5) & !is.na(YearBuilt) & !is.na(TaxAmount) & !is.na(percentileZipPrice) & !is.na(LotSizeSqFt))
Generate neighborhood price index, factorize.
df.AVM$priceIndex <- round(df.AVM$percentileBlockPrice * 10)
df.AVM$priceIndex.factor <- factor(df.AVM$priceIndex, ordered = TRUE)
Let's filter out home prices above $1 million; these homes exhibit different valuation criteria, so we'll treat them as outliers and fit a model specifically for high-value homes at a later time. The minimum value is also of concern: 330. No home sells for only three hundred and thirty dollars; this is most likely an error, so we'll drop it.
df.AVM <- df.AVM %>%
mutate(pricePct = percent_rank(CurrSalesPrice))
df.AVM_outcome <- df.AVM %>%
filter(daysElapsedSale < 730 & pricePct < .99 & CurrSalesPrice > 100000 ) %>%
select(apn, CurrSalesPrice, yearSale, monthSale)
df.AVM$yearSale <- NULL
df.AVM$monthSale <-NULL
df.AVM$CurrSalesPrice <- NULL
df.AVM <- left_join(df.AVM, df.AVM_outcome, by="apn")
summary(df.AVM$CurrSalesPrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 100424 282500 365000 389185 455500 1237000 527362
rm(df.AVM_train)
We backfill the missing sale month and year so that every row carries temporal dummies for the current year and month, which is what lets us predict the current market price.
df.AVM$monthSale[is.na(df.AVM$monthSale)] <- "09"
df.AVM$yearSale[is.na(df.AVM$yearSale)] <- "2018"
numericPredictors <- c("BuildingArea", "AssdImprovementValue", "AssdLandValue", "LotSizeSqFt", "SumGarageSqFt", "TaxAmount", "propertyAge")
#propertyAge was dropped by the earlier select(), so only six of these remain in df.AVM
DFnumeric <- df.AVM[, names(df.AVM) %in% numericPredictors]
#FIPS identifies the county and is assumed to be set by the upstream sourced scripts
if (exists("FIPS") && FIPS %in% c("06073", "06037")) {
DFnumeric$SumGarageSqFt <- NULL
}
DFfactors <- df.AVM[, c("typeProperty.factor", "typeGarage.factor", "zipPriceIndex.factor", "tractPriceIndex.factor", "blockGroupPriceIndex.factor", "blockPriceIndex.factor","numberStories.factor", "yearSale", "monthSale")]
cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')
## There are 6 numeric variables, and 9 factor variables
Let’s go ahead and take the log of any numeric variables with an absolute skewness above .8
for(i in 1:ncol(DFnumeric)){
if (abs(skew(DFnumeric[,i]))>0.8){
DFnumeric[,i] <- log(DFnumeric[,i] +1)
}
}
PreNum <- preProcess(DFnumeric, method=c("center", "scale"))
print(PreNum)
## Created from 602817 samples and 6 variables
##
## Pre-processing:
## - centered (6)
## - ignored (0)
## - scaled (6)
DFnorm <- predict(PreNum, DFnumeric)
dim(DFnorm)
## [1] 602817 6
There are some categorical variables we already created dummies for. The remaining factor vars we’ll one-hot encode using the model.matrix function.
DFdummies <- as.data.frame(model.matrix(~.-1, DFfactors))
dim(DFdummies)
## [1] 602817 64
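As a standalone illustration of what model.matrix is doing here (a toy example, separate from the pipeline):
toy <- data.frame(garage = factor(c("1", "3", "4", "3")))
model.matrix(~ . - 1, toy)  #one 0/1 indicator column per level: garage1, garage3, garage4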
CurrSalesPrice is skewed right.
skew(df.AVM$CurrSalesPrice)
## [1] 1.432996
qqnorm(df.AVM$CurrSalesPrice)
qqline(df.AVM$CurrSalesPrice)
Let’s take the natural logarithm of CurrSalesPrice.
df.AVM$CurrSalesPrice <- log(df.AVM$CurrSalesPrice)
skew(df.AVM$CurrSalesPrice)
## [1] -0.1323841
df.combined <- cbind(DFnorm, DFdummies, df.AVM[,c("SumResidentialUnits", "GarageParkingNbr", "BathTotalCalc", "Bedrooms")])
set.seed(34567)
ID <- df.AVM$apn
outcome <- df.AVM$CurrSalesPrice[!is.na(df.AVM$CurrSalesPrice)]
train1 <- df.combined[!is.na(df.AVM$CurrSalesPrice),]
test1 <- df.combined[is.na(df.AVM$CurrSalesPrice),]
complete1 <- rbind(train1, test1)
lm_model_15 <- lm(outcome ~ ., data=train1)
summary(lm_model_15)
##
## Call:
## lm(formula = outcome ~ ., data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29442 -0.07909 -0.00120 0.07995 1.74317
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.234e+01 3.437e-02 358.866 < 2e-16
## BuildingArea -2.072e-02 1.026e-03 -20.191 < 2e-16
## AssdImprovementValue -4.673e-03 7.386e-04 -6.326 2.53e-10
## AssdLandValue 7.442e-02 1.475e-03 50.464 < 2e-16
## LotSizeSqFt 4.466e-03 7.945e-04 5.622 1.90e-08
## SumGarageSqFt 4.159e-02 4.441e-03 9.366 < 2e-16
## TaxAmount 7.810e-02 1.511e-03 51.678 < 2e-16
## typeProperty.factor1001 6.192e-03 3.361e-02 0.184 0.853846
## typeProperty.factor1004 -9.682e-02 3.356e-02 -2.885 0.003920
## typeProperty.factor1009 2.941e-02 3.420e-02 0.860 0.389831
## typeProperty.factor1010 NA NA NA NA
## typeGarage.factor10 NA NA NA NA
## typeGarage.factor11 1.696e-01 1.371e-02 12.372 < 2e-16
## typeGarage.factor3 1.061e-01 1.451e-02 7.313 2.64e-13
## typeGarage.factor4 1.036e-02 3.249e-03 3.188 0.001431
## typeGarage.factor7 5.420e-02 2.029e-02 2.671 0.007566
## zipPriceIndex.factor.L 1.358e-01 8.920e-03 15.224 < 2e-16
## zipPriceIndex.factor.Q -5.224e-02 8.389e-03 -6.227 4.79e-10
## zipPriceIndex.factor.C -1.278e-02 7.322e-03 -1.746 0.080851
## `zipPriceIndex.factor^4` -5.001e-03 5.942e-03 -0.842 0.400020
## `zipPriceIndex.factor^5` -9.194e-03 4.558e-03 -2.017 0.043663
## `zipPriceIndex.factor^6` -8.224e-03 3.489e-03 -2.358 0.018400
## `zipPriceIndex.factor^7` -5.063e-03 2.723e-03 -1.860 0.062947
## `zipPriceIndex.factor^8` 1.203e-02 2.349e-03 5.119 3.07e-07
## `zipPriceIndex.factor^9` -3.100e-02 2.150e-03 -14.423 < 2e-16
## `zipPriceIndex.factor^10` -2.127e-02 2.232e-03 -9.528 < 2e-16
## tractPriceIndex.factor.L -2.005e-03 7.697e-03 -0.260 0.794511
## tractPriceIndex.factor.Q -6.782e-03 5.773e-03 -1.175 0.240035
## tractPriceIndex.factor.C -1.120e-02 4.922e-03 -2.275 0.022906
## `tractPriceIndex.factor^4` -1.660e-02 3.921e-03 -4.235 2.29e-05
## `tractPriceIndex.factor^5` 1.100e-02 3.537e-03 3.109 0.001880
## `tractPriceIndex.factor^6` -1.049e-02 3.039e-03 -3.454 0.000553
## `tractPriceIndex.factor^7` -9.491e-03 2.644e-03 -3.590 0.000331
## `tractPriceIndex.factor^8` 1.905e-03 2.501e-03 0.762 0.446311
## `tractPriceIndex.factor^9` 1.005e-02 2.463e-03 4.082 4.46e-05
## `tractPriceIndex.factor^10` -8.031e-03 2.287e-03 -3.512 0.000445
## blockGroupPriceIndex.factor.L 5.649e-02 7.734e-03 7.304 2.82e-13
## blockGroupPriceIndex.factor.Q -1.696e-02 5.812e-03 -2.918 0.003521
## blockGroupPriceIndex.factor.C 5.557e-02 4.914e-03 11.309 < 2e-16
## `blockGroupPriceIndex.factor^4` -5.365e-04 3.988e-03 -0.135 0.892991
## `blockGroupPriceIndex.factor^5` 1.083e-03 3.469e-03 0.312 0.754829
## `blockGroupPriceIndex.factor^6` -5.862e-03 2.974e-03 -1.971 0.048690
## `blockGroupPriceIndex.factor^7` -5.980e-04 2.615e-03 -0.229 0.819130
## `blockGroupPriceIndex.factor^8` -6.995e-03 2.477e-03 -2.824 0.004746
## `blockGroupPriceIndex.factor^9` -1.801e-03 2.426e-03 -0.742 0.457955
## `blockGroupPriceIndex.factor^10` 4.379e-03 2.269e-03 1.930 0.053590
## blockPriceIndex.factor.L 8.677e-01 6.015e-03 144.258 < 2e-16
## blockPriceIndex.factor.Q 2.259e-02 4.402e-03 5.132 2.87e-07
## blockPriceIndex.factor.C 1.774e-01 3.837e-03 46.242 < 2e-16
## `blockPriceIndex.factor^4` 4.710e-03 3.272e-03 1.440 0.149966
## `blockPriceIndex.factor^5` 2.431e-02 2.845e-03 8.546 < 2e-16
## `blockPriceIndex.factor^6` 9.072e-03 2.523e-03 3.596 0.000323
## `blockPriceIndex.factor^7` -9.398e-03 2.338e-03 -4.020 5.83e-05
## `blockPriceIndex.factor^8` 1.213e-02 2.200e-03 5.511 3.57e-08
## `blockPriceIndex.factor^9` 5.619e-03 2.159e-03 2.603 0.009237
## `blockPriceIndex.factor^10` -4.932e-03 2.110e-03 -2.337 0.019419
## numberStories.factor2 -3.050e-02 1.874e-03 -16.272 < 2e-16
## numberStories.factor3 -1.496e-02 6.921e-03 -2.161 0.030671
## yearSale2017 6.146e-02 4.392e-03 13.995 < 2e-16
## yearSale2018 1.612e-01 4.650e-03 34.666 < 2e-16
## monthSale02 1.221e-02 3.824e-03 3.193 0.001408
## monthSale03 1.716e-02 3.539e-03 4.849 1.24e-06
## monthSale04 2.982e-02 3.573e-03 8.345 < 2e-16
## monthSale05 2.709e-02 3.503e-03 7.732 1.07e-14
## monthSale06 3.575e-02 3.485e-03 10.257 < 2e-16
## monthSale07 3.985e-02 3.574e-03 11.150 < 2e-16
## monthSale08 4.019e-02 3.529e-03 11.389 < 2e-16
## monthSale09 4.043e-02 3.681e-03 10.983 < 2e-16
## monthSale10 4.362e-02 3.649e-03 11.954 < 2e-16
## monthSale11 5.269e-02 4.131e-03 12.755 < 2e-16
## monthSale12 6.207e-02 4.282e-03 14.494 < 2e-16
## SumResidentialUnits -2.323e-04 1.777e-05 -13.072 < 2e-16
## GarageParkingNbr 2.304e-02 1.711e-03 13.462 < 2e-16
## BathTotalCalc 7.130e-02 1.777e-03 40.126 < 2e-16
## Bedrooms 1.393e-02 1.071e-03 13.008 < 2e-16
##
## (Intercept) ***
## BuildingArea ***
## AssdImprovementValue ***
## AssdLandValue ***
## LotSizeSqFt ***
## SumGarageSqFt ***
## TaxAmount ***
## typeProperty.factor1001
## typeProperty.factor1004 **
## typeProperty.factor1009
## typeProperty.factor1010
## typeGarage.factor10
## typeGarage.factor11 ***
## typeGarage.factor3 ***
## typeGarage.factor4 **
## typeGarage.factor7 **
## zipPriceIndex.factor.L ***
## zipPriceIndex.factor.Q ***
## zipPriceIndex.factor.C .
## `zipPriceIndex.factor^4`
## `zipPriceIndex.factor^5` *
## `zipPriceIndex.factor^6` *
## `zipPriceIndex.factor^7` .
## `zipPriceIndex.factor^8` ***
## `zipPriceIndex.factor^9` ***
## `zipPriceIndex.factor^10` ***
## tractPriceIndex.factor.L
## tractPriceIndex.factor.Q
## tractPriceIndex.factor.C *
## `tractPriceIndex.factor^4` ***
## `tractPriceIndex.factor^5` **
## `tractPriceIndex.factor^6` ***
## `tractPriceIndex.factor^7` ***
## `tractPriceIndex.factor^8`
## `tractPriceIndex.factor^9` ***
## `tractPriceIndex.factor^10` ***
## blockGroupPriceIndex.factor.L ***
## blockGroupPriceIndex.factor.Q **
## blockGroupPriceIndex.factor.C ***
## `blockGroupPriceIndex.factor^4`
## `blockGroupPriceIndex.factor^5`
## `blockGroupPriceIndex.factor^6` *
## `blockGroupPriceIndex.factor^7`
## `blockGroupPriceIndex.factor^8` **
## `blockGroupPriceIndex.factor^9`
## `blockGroupPriceIndex.factor^10` .
## blockPriceIndex.factor.L ***
## blockPriceIndex.factor.Q ***
## blockPriceIndex.factor.C ***
## `blockPriceIndex.factor^4`
## `blockPriceIndex.factor^5` ***
## `blockPriceIndex.factor^6` ***
## `blockPriceIndex.factor^7` ***
## `blockPriceIndex.factor^8` ***
## `blockPriceIndex.factor^9` **
## `blockPriceIndex.factor^10` *
## numberStories.factor2 ***
## numberStories.factor3 *
## yearSale2017 ***
## yearSale2018 ***
## monthSale02 **
## monthSale03 ***
## monthSale04 ***
## monthSale05 ***
## monthSale06 ***
## monthSale07 ***
## monthSale08 ***
## monthSale09 ***
## monthSale10 ***
## monthSale11 ***
## monthSale12 ***
## SumResidentialUnits ***
## GarageParkingNbr ***
## BathTotalCalc ***
## Bedrooms ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1909 on 75382 degrees of freedom
## Multiple R-squared: 0.7764, Adjusted R-squared: 0.7761
## F-statistic: 3634 on 72 and 75382 DF, p-value: < 2.2e-16
prediction <- predict(lm_model_15, train1, type="response")
## Warning in predict.lm(lm_model_15, train1, type = "response"): prediction
## from a rank-deficient fit may be misleading
model_output_lm <- cbind(train1, prediction)
#RMSE on the training data (optimistic; see the holdout sketch below)
rmse(outcome,model_output_lm$prediction)
## [1] 0.1908095
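Note that this RMSE is computed on the same rows used to fit the model, so it is optimistic. A quick holdout check is sketched below using caret's createDataPartition; the 80/20 split is an assumption:
set.seed(34567)
idx <- createDataPartition(outcome, p = 0.8, list = FALSE)
lm_holdout <- lm(outcome[idx] ~ ., data = train1[idx, ])
#score the 20% of rows the model never saw
rmse(outcome[-idx], predict(lm_holdout, newdata = train1[-idx, ]))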
Set up the grid of tuning parameters:
xgb_grid = expand.grid(
nrounds = 1000,
eta = c(0.1, 0.05, 0.01),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree=1,
min_child_weight=c(1, 2, 3, 4 ,5),
subsample=1
)
Use 5-fold cross-validation and let caret find the best hyperparameter values. Warning: this is very computationally expensive!
#xgb_caret <- train(x=train1[,-95], y=label_train, method='xgbTree', trControl= my_control, tuneGrid=xgb_grid)
#xgb_caret$bestTune
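my_control and label_train are not defined in this notebook; presumably my_control is a caret trainControl object and label_train is the outcome vector. A 5-fold setup consistent with the text (an assumption) would be:
#5-fold cross-validation controller for caret's tuner
my_control <- trainControl(method = "cv", number = 5)
label_train <- outcome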
The results above yield optimal hyperparameter values of max_depth = 6, eta = 0.05, and min_child_weight = 4.
# put our testing and training data into two separate DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train1), label= as.numeric(outcome))
dtest <- xgb.DMatrix(data = as.matrix(test1))
default_param <-list(
objective = "reg:linear",
booster = "gbtree",
eta=0.05, #default = 0.3
gamma=0,
max_depth=6, #default=6
min_child_weight=4, #default=1
subsample=1,
colsample_bytree=1
)
Perform cross validation and determine best number of rounds for the above default parameters.
xgbcv <- xgb.cv( params = default_param, data = dtrain, nrounds = 500, nfold = 5, showsd = T, stratified = T, print_every_n = 40, early_stopping_rounds = 10, maximize = F)
## [1] train-rmse:11.683326+0.000603 test-rmse:11.683329+0.002479
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [41] train-rmse:1.510782+0.000086 test-rmse:1.510873+0.001871
## [81] train-rmse:0.241245+0.000140 test-rmse:0.244040+0.001588
## [121] train-rmse:0.140788+0.000151 test-rmse:0.147997+0.001093
## [161] train-rmse:0.135424+0.000281 test-rmse:0.144660+0.001150
## [201] train-rmse:0.133035+0.000292 test-rmse:0.143860+0.001169
## [241] train-rmse:0.130998+0.000360 test-rmse:0.143350+0.001193
## [281] train-rmse:0.129163+0.000339 test-rmse:0.142953+0.001231
## [321] train-rmse:0.127567+0.000383 test-rmse:0.142693+0.001208
## [361] train-rmse:0.126076+0.000381 test-rmse:0.142454+0.001199
## [401] train-rmse:0.124580+0.000332 test-rmse:0.142299+0.001186
## [441] train-rmse:0.123117+0.000328 test-rmse:0.142148+0.001210
## [481] train-rmse:0.121805+0.000350 test-rmse:0.142062+0.001207
## [500] train-rmse:0.121184+0.000377 test-rmse:0.142029+0.001229
Now let’s train the model using the best iteration found by cross validation.
xgb_mod <- xgb.train(data = dtrain, params=default_param, nrounds = 481)
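Rather than hardcoding 481 rounds, the best round can be read from the cross-validation object (assuming this xgboost version exposes best_iteration, as versions with early stopping do):
xgb_mod <- xgb.train(data = dtrain, params = default_param, nrounds = xgbcv$best_iteration)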
Since we’re predicting the outcome of a traditional sale at the current month and year, replace fields:
df.combined$yearSale2018 <- 1
df.combined$yearSale2017 <- 0
df.combined$monthSale02 <- 0
df.combined$monthSale03 <- 0
df.combined$monthSale04 <- 0
df.combined$monthSale05 <- 0
df.combined$monthSale06 <- 0
df.combined$monthSale07 <- 0
df.combined$monthSale08 <- 0
df.combined$monthSale09 <- 0
df.combined$monthSale10 <- 0
df.combined$monthSale11 <- 1
df.combined$monthSale12 <- 0
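The thirteen assignments above can be written more compactly; an equivalent sketch that zeroes every month dummy and then flags the current month and year:
monthCols <- paste0("monthSale", sprintf("%02d", 2:12))  #monthSale01 is the omitted baseline
df.combined[monthCols] <- 0
df.combined$monthSale11 <- 1  #current month
df.combined$yearSale2017 <- 0
df.combined$yearSale2018 <- 1 #current year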
dcomplete <- xgb.DMatrix(data = as.matrix(df.combined))
#Predict on the combined set
XGBpred <- predict(xgb_mod, dcomplete)
#reverse the log transform to recover dollar values
predictions_XGB <- as.data.frame(exp(XGBpred))
head(predictions_XGB)
## exp(XGBpred)
## 1 746987.5
## 2 760101.5
## 3 665690.9
## 4 989448.5
## 5 766393.4
## 6 844878.3
priceCurrentEst <- as.data.frame(cbind(predictions_XGB, ID))
priceCurrentEst$priceCurrentEst <- priceCurrentEst$`exp(XGBpred)`
priceCurrentEst$priceCurrentEst <- round(priceCurrentEst$priceCurrentEst)
priceCurrentEst$apn <- as.character(priceCurrentEst$ID)
priceCurrentEst$`exp(XGBpred)` <- NULL
priceCurrentEst$ID <- NULL
#view variable importance plot
library(Ckmeans.1d.dp) #required for ggplot clustering
mat <- xgb.importance(feature_names = colnames(train1), model = xgb_mod)
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)