This is a guide through my exploration of the Kaggle Zillow Round One dataset. This is my first Kaggle competition.
I will do a thorough write-up of my exploration and predictive modeling following the completion of Round 1.
######################## Set Environment
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggmap)
library(OpenStreetMap)
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
######################## Pre Process
# Load the 2016 transaction records
train <- read.csv("Data/train_2016.csv")
# Re-type the key as character and parse the sale date into a Date
train[["parcelid"]] <- as.character(train[["parcelid"]])
train[["transactiondate"]] <- ymd(train[["transactiondate"]])
# Count the parcels that appear in more than one transaction
train %>%
  group_by(parcelid) %>%
  summarise(freq = n()) %>%
  filter(freq > 1) %>%
  summarise(total = n())
## Warning: package 'bindrcpp' was built under R version 3.3.3
## # A tibble: 1 x 1
## total
## <int>
## 1 128
# Remove repeated transactions: keep only the first recorded sale of each
# parcel. duplicated() flags every occurrence after the first, so no helper
# counter column has to be created and dropped again, and a missing key can
# no longer produce an all-NA row in the result.
train <- train[!duplicated(train$parcelid), ]
### Property Data
# Import property data
properties <- read.csv("Data/properties_2016.csv")
# Print a per-column summary (quantiles or level counts, plus NA counts) to
# see how each field is typed and how complete it is
summary(properties)
## parcelid airconditioningtypeid architecturalstyletypeid
## Min. : 10711725 Min. : 1.0 Min. : 2.0
## 1st Qu.: 11643707 1st Qu.: 1.0 1st Qu.: 7.0
## Median : 12545094 Median : 1.0 Median : 7.0
## Mean : 13325858 Mean : 1.9 Mean : 7.2
## 3rd Qu.: 14097122 3rd Qu.: 1.0 3rd Qu.: 7.0
## Max. :169601949 Max. :13.0 Max. :27.0
## NA's :2173698 NA's :2979156
## basementsqft bathroomcnt bedroomcnt buildingclasstypeid
## Min. : 20.0 Min. : 0.000 Min. : 0.000 Min. :1.0
## 1st Qu.: 272.0 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:3.0
## Median : 534.0 Median : 2.000 Median : 3.000 Median :4.0
## Mean : 646.9 Mean : 2.209 Mean : 3.089 Mean :3.7
## 3rd Qu.: 847.2 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.:4.0
## Max. :8516.0 Max. :20.000 Max. :20.000 Max. :5.0
## NA's :2983589 NA's :11462 NA's :11450 NA's :2972588
## buildingqualitytypeid calculatedbathnbr decktypeid
## Min. : 1.0 Min. : 1.0 Min. :66
## 1st Qu.: 4.0 1st Qu.: 2.0 1st Qu.:66
## Median : 7.0 Median : 2.0 Median :66
## Mean : 5.8 Mean : 2.3 Mean :66
## 3rd Qu.: 7.0 3rd Qu.: 3.0 3rd Qu.:66
## Max. :12.0 Max. :20.0 Max. :66
## NA's :1046729 NA's :128912 NA's :2968121
## finishedfloor1squarefeet calculatedfinishedsquarefeet
## Min. : 3 Min. : 1
## 1st Qu.: 1012 1st Qu.: 1213
## Median : 1283 Median : 1572
## Mean : 1381 Mean : 1827
## 3rd Qu.: 1615 3rd Qu.: 2136
## Max. :31303 Max. :952576
## NA's :2782500 NA's :55565
## finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15
## Min. : 1 Min. : 120 Min. : 112
## 1st Qu.: 1196 1st Qu.: 960 1st Qu.: 1694
## Median : 1539 Median :1296 Median : 2172
## Mean : 1760 Mean :1179 Mean : 2739
## 3rd Qu.: 2070 3rd Qu.:1440 3rd Qu.: 2976
## Max. :290345 Max. :2688 Max. :820242
## NA's :276033 NA's :2977545 NA's :2794419
## finishedsquarefeet50 finishedsquarefeet6 fips
## Min. : 3 Min. : 117 Min. :6037
## 1st Qu.: 1013 1st Qu.: 1079 1st Qu.:6037
## Median : 1284 Median : 1992 Median :6037
## Mean : 1389 Mean : 2414 Mean :6048
## 3rd Qu.: 1618 3rd Qu.: 3366 3rd Qu.:6059
## Max. :31303 Max. :952576 Max. :6111
## NA's :2782500 NA's :2963216 NA's :11437
## fireplacecnt fullbathcnt garagecarcnt garagetotalsqft
## Min. :1.0 Min. : 1.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.0 1st Qu.: 2.00 1st Qu.: 2.0 1st Qu.: 324.0
## Median :1.0 Median : 2.00 Median : 2.0 Median : 441.0
## Mean :1.2 Mean : 2.24 Mean : 1.8 Mean : 383.8
## 3rd Qu.:1.0 3rd Qu.: 3.00 3rd Qu.: 2.0 3rd Qu.: 494.0
## Max. :9.0 Max. :20.00 Max. :25.0 Max. :7749.0
## NA's :2672580 NA's :128912 NA's :2101950 NA's :2101950
## hashottuborspa heatingorsystemtypeid latitude
## :2916203 Min. : 1 Min. :33324388
## true: 69014 1st Qu.: 2 1st Qu.:33827685
## Median : 2 Median :34008249
## Mean : 4 Mean :34001469
## 3rd Qu.: 7 3rd Qu.:34161860
## Max. :24 Max. :34819650
## NA's :1178816 NA's :11437
## longitude lotsizesquarefeet poolcnt
## Min. :-119475780 Min. : 100 Min. :1
## 1st Qu.:-118392983 1st Qu.: 5688 1st Qu.:1
## Median :-118172540 Median : 7000 Median :1
## Mean :-118201934 Mean : 22823 Mean :1
## 3rd Qu.:-117949468 3rd Qu.: 9898 3rd Qu.:1
## Max. :-117554316 Max. :328263808 Max. :1
## NA's :11437 NA's :276099 NA's :2467683
## poolsizesum pooltypeid10 pooltypeid2 pooltypeid7
## Min. : 19.0 Min. :1 Min. :1 Min. :1
## 1st Qu.: 430.0 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Median : 495.0 Median :1 Median :1 Median :1
## Mean : 519.7 Mean :1 Mean :1 Mean :1
## 3rd Qu.: 594.0 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :17410.0 Max. :1 Max. :1 Max. :1
## NA's :2957257 NA's :2948278 NA's :2953142 NA's :2499758
## propertycountylandusecode propertylandusetypeid propertyzoningdesc
## 0100 :1153896 Min. : 31 :1006588
## 122 : 522145 1st Qu.:261 LAR1 : 275029
## 0101 : 247494 Median :261 LAR3 : 67105
## 010C : 225410 Mean :260 LARS : 54859
## 1111 : 126491 3rd Qu.:261 LBR1N : 52750
## 34 : 123249 Max. :275 LAR2 : 48808
## (Other): 586532 NA's :11437 (Other):1480078
## rawcensustractandblock regionidcity regionidcounty
## Min. :60371011 Min. : 3491 Min. :1286
## 1st Qu.:60373203 1st Qu.: 12447 1st Qu.:2061
## Median :60375712 Median : 25218 Median :3101
## Mean :60483450 Mean : 34993 Mean :2570
## 3rd Qu.:60590423 3rd Qu.: 45457 3rd Qu.:3101
## Max. :61110091 Max. :396556 Max. :3101
## NA's :11437 NA's :62845 NA's :11437
## regionidneighborhood regionidzip roomcnt storytypeid
## Min. : 6952 Min. : 95982 Min. : 0.000 Min. :7
## 1st Qu.: 46736 1st Qu.: 96180 1st Qu.: 0.000 1st Qu.:7
## Median :118920 Median : 96377 Median : 0.000 Median :7
## Mean :193476 Mean : 96553 Mean : 1.475 Mean :7
## 3rd Qu.:274800 3rd Qu.: 96974 3rd Qu.: 0.000 3rd Qu.:7
## Max. :764167 Max. :399675 Max. :96.000 Max. :7
## NA's :1828815 NA's :13980 NA's :11475 NA's :2983593
## threequarterbathnbr typeconstructiontypeid unitcnt
## Min. :1 Min. : 4 Min. : 1.0
## 1st Qu.:1 1st Qu.: 6 1st Qu.: 1.0
## Median :1 Median : 6 Median : 1.0
## Mean :1 Mean : 6 Mean : 1.2
## 3rd Qu.:1 3rd Qu.: 6 3rd Qu.: 1.0
## Max. :7 Max. :13 Max. :997.0
## NA's :2673586 NA's :2978470 NA's :1007727
## yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories
## Min. : 10.0 Min. : 10.0 Min. :1801 Min. : 1.0
## 1st Qu.: 190.0 1st Qu.: 96.0 1st Qu.:1950 1st Qu.: 1.0
## Median : 270.0 Median : 168.0 Median :1963 Median : 1.0
## Mean : 319.8 Mean : 278.3 Mean :1964 Mean : 1.4
## 3rd Qu.: 390.0 3rd Qu.: 320.0 3rd Qu.:1981 3rd Qu.: 2.0
## Max. :7983.0 Max. :6141.0 Max. :2015 Max. :41.0
## NA's :2904862 NA's :2982570 NA's :59928 NA's :2303148
## fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt
## :2980054 Min. : 1 Min. : 1
## true: 5163 1st Qu.: 74800 1st Qu.: 179675
## Median : 122590 Median : 306086
## Mean : 170884 Mean : 420479
## 3rd Qu.: 196889 3rd Qu.: 488000
## Max. :251486000 Max. :282786000
## NA's :54982 NA's :42550
## assessmentyear landtaxvaluedollarcnt taxamount
## Min. :2000 Min. : 1 Min. : 1
## 1st Qu.:2015 1st Qu.: 74836 1st Qu.: 2461
## Median :2015 Median : 167042 Median : 3992
## Mean :2015 Mean : 252478 Mean : 5378
## 3rd Qu.:2015 3rd Qu.: 306918 3rd Qu.: 6201
## Max. :2016 Max. :90246219 Max. :3458861
## NA's :11439 NA's :67733 NA's :31250
## taxdelinquencyflag taxdelinquencyyear censustractandblock
## :2928755 Min. : 0.0 Min. :-1.000e+00
## Y: 56462 1st Qu.:14.0 1st Qu.: 6.037e+13
## Median :14.0 Median : 6.038e+13
## Mean :13.9 Mean : 6.048e+13
## 3rd Qu.:15.0 3rd Qu.: 6.059e+13
## Max. :99.0 Max. : 4.830e+14
## NA's :2928753 NA's :75126
# Reclassify
# Re-type the property columns inside within(): parcelid becomes character,
# the id/code columns become factors, and the three flag columns have their
# blank entries recoded to "false" before being converted to factors.
# Each cat() prints a progress marker once the statement above it has run.
properties <- within(properties, {
# Key column: character, matching the type used for train$parcelid
parcelid <- as.character(parcelid)
cat("parcelid\n")
# Numeric type ids are category codes, so store them as factors
airconditioningtypeid <- as.factor(airconditioningtypeid)
cat("airconditiontypeid\n")
buildingclasstypeid <- as.factor(buildingclasstypeid)
cat("buildingclasstypeid\n")
buildingqualitytypeid <- as.factor(buildingqualitytypeid)
cat("buildingqualitytypeid\n")
decktypeid <- as.factor(decktypeid)
cat("decktypeid\n")
fips <- as.factor(fips)
cat("fips\n")
# Flag column: the summary above shows only "" and "true"; recode "" to
# "false" so the factor has two explicit levels
hashottuborspa <- as.character(hashottuborspa)
cat("hashottuborspa0\n")
hashottuborspa[hashottuborspa == ""] <- "false"
cat("hashottuborspa1\n")
hashottuborspa <- as.factor(hashottuborspa)
cat("hashottuborspa2\n")
heatingorsystemtypeid <- as.factor(heatingorsystemtypeid)
cat("heatingorsystemtypeid\n")
pooltypeid10 <- as.factor(pooltypeid10)
cat("pooltypeid10\n")
pooltypeid2 <- as.factor(pooltypeid2)
cat("pooltypeid2\n")
pooltypeid7 <- as.factor(pooltypeid7)
cat("pooltypeid7\n")
propertylandusetypeid <- as.factor(propertylandusetypeid)
cat("propertylandusetypeid\n")
# Geographic identifiers are likewise treated as categories
rawcensustractandblock <- as.factor(rawcensustractandblock)
cat("rawcensustractandblock\n")
regionidcity <- as.factor(regionidcity)
cat("regionidcity\n")
regionidcounty <- as.factor(regionidcounty)
cat("regionidcounty\n")
regionidneighborhood <- as.factor(regionidneighborhood)
cat("regionidneighborhood\n")
# NOTE(review): clean.zipcodes() comes from the 'zipcode' package, which is
# called by namespace and not attached above; presumably it normalises the
# values before they are factored. Confirm it suits these region ids.
regionidzip <- as.factor(zipcode::clean.zipcodes(regionidzip))
cat("regionidzip\n")
storytypeid <- as.factor(storytypeid)
cat("storytypeid\n")
typeconstructiontypeid <- as.factor(typeconstructiontypeid)
cat("typeconstructiontypeid\n")
# Flag column: "" is recoded to "false", same treatment as hashottuborspa
fireplaceflag <- as.character(fireplaceflag)
cat("fireplaceflag0\n")
fireplaceflag[fireplaceflag == ""] <- "false"
cat("fireplaceflag1\n")
fireplaceflag <- as.factor(fireplaceflag)
cat("fireplaceflag2\n")
# Flag column: "" is recoded to "false"; the summary above shows the other
# value is "Y", so the resulting levels are "false" and "Y"
taxdelinquencyflag <- as.character(taxdelinquencyflag)
cat("taxdelinquencyflag0\n")
taxdelinquencyflag[taxdelinquencyflag == ""] <- "false"
cat("taxdelinquencyflag1\n")
taxdelinquencyflag <- as.factor(taxdelinquencyflag)
cat("taxdelinquencyflag2\n")
censustractandblock <- as.factor(censustractandblock)
cat("censustractandblock\n")
})
## parcelid
## airconditiontypeid
## buildingclasstypeid
## buildingqualitytypeid
## decktypeid
## fips
## hashottuborspa0
## hashottuborspa1
## hashottuborspa2
## heatingorsystemtypeid
## pooltypeid10
## pooltypeid2
## pooltypeid7
## propertylandusetypeid
## rawcensustractandblock
## regionidcity
## regionidcounty
## regionidneighborhood
## regionidzip
## storytypeid
## typeconstructiontypeid
## fireplaceflag0
## fireplaceflag1
## fireplaceflag2
## taxdelinquencyflag0
## taxdelinquencyflag1
## taxdelinquencyflag2
## censustractandblock
# Latitude and longitude are stored as large integers (see the summary
# above); divide both by 1e6 so they can be used as map coordinates
for (coord in c("latitude", "longitude")) {
  properties[[coord]] <- properties[[coord]] / 1e+06
}
### Merged
# Merge Datasets
# Left join: every property, with logerror / transactiondate filled in only
# where a transaction exists
properties_complete <- merge(x = properties, y = train, by = "parcelid", all.x = TRUE)
# Inner join: only the properties that have a transaction
train_complete <- merge(x = properties, y = train, by = "parcelid")
######################## Exploratory Analysis
## Examine Train Set
# Five-number summary (plus mean) of the target variable
summary(train[["logerror"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.60500 -0.02630 0.00500 0.01077 0.03830 4.73700
ggplot(train, aes(logerror)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of sales over the calendar
ggplot(train, aes(transactiondate)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Log error against sale date, with a smoothed trend line on top
ggplot(train, aes(transactiondate, logerror)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Examine Missing Values
# First look at how many NAs can be in a row, and where the transactions are.
# merge() returns its rows sorted on the key by default, so the NA counts are
# taken from properties_complete itself (restricted to the property columns).
# Counting on `properties`, which is still in file order, would pair each
# count with another parcel's logerror.
property_cols <- colnames(properties)
emptyrows <- data.frame(
  empties = rowSums(is.na(properties_complete[, property_cols])),
  sold = !is.na(properties_complete$logerror),
  margin = properties_complete$logerror
)
ggplot(data = emptyrows, aes(x = empties, fill = sold)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Count the non-missing entries of every property column and chart them,
# ordered from least to most complete
filled_counts <- colSums(!is.na(properties))
empty_columns <- data.frame(column_name = colnames(properties), notempties = filled_counts)
ggplot(
  empty_columns,
  aes(x = reorder(column_name, notempties), y = notempties, fill = notempties)
) +
  geom_bar(stat = "identity") +
  coord_flip()
# Same NA-count histogram, restricted to rows that have a transaction
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Does the number of missing fields relate to the log error?
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties, y = margin)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
# Same comparison using the magnitude of the log error
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties, y = abs(margin))) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Understand how Properties Build Across Periods Change
# Histogram of construction year over all properties
ggplot(properties, aes(x = yearbuilt)) +
  geom_histogram() +
  ggtitle("building across time built") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 59928 rows containing non-finite values (stat_bin).
# Interesting - properties with a missing yearbuilt show a higher absolute
# error than dated ones, so this is an important value to impute
ggplot(train_complete, aes(x = as.factor(yearbuilt), y = abs(logerror))) +
  geom_boxplot() +
  ggtitle("error across time built") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Correlate year build with all other numerics
# (TODO: no code for the correlation step yet)
# Map yearbuilt
# Bounding box around every property's coordinates.
# NOTE(review): the indexing below assumes make_bbox() returns
# c(left, bottom, right, top) - confirm against the ggmap docs.
bbox2 <- make_bbox(properties_complete$longitude, properties_complete$latitude,
f = 1e-05) # Bounding box with a very small padding fraction
map <- openmap(c(bbox2[4], bbox2[1]), c(bbox2[2], bbox2[3]), type = "esri") # Fetch tiles: (top, left) corner, then (bottom, right) corner
map <- openproj(map) # NOTE(review): presumably reprojects the tiles to lat/long so the points line up - confirm
p0 <- autoplot(map) # Base map as a ggplot object
# Overlay one point per property, coloured by construction year
p <- p0 + geom_point(aes(x = longitude, y = latitude, col = yearbuilt), data = properties_complete,
alpha = 0.8, size = 0.5) + ggtitle("Properties by Year Built") + scale_colour_distiller(palette = "Spectral")
p
## Warning: Removed 11437 rows containing missing values (geom_point).
######################## Modeling and Feature Engineering
### Establish a train set and cross validation set
set.seed(232)
# 75% of the merged transactions go to training; the remainder is held out
trainIndex <- createDataPartition(y = train_complete$logerror, p = 0.75, list = FALSE)
training <- train_complete[trainIndex, ]
crossval <- train_complete[-trainIndex, ]
### True Baseline - Take average of all train logs and use as prediction
train_meanlogerror <- mean(training$logerror)
# "\n" (not "/n") is the newline escape, so the label ends its own line
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
# RMSE of predicting the training mean for every cross validation row
cat(sqrt(mean((train_meanlogerror - crossval$logerror)^2)))
## 0.1618654
### Use 5-fold cross validation as the resampling scheme for every model fit
### during feature engineering
fitControl <- trainControl(
  method = "cv",
  number = 5
)
### Baseline: fit on the most complete variables (15 factor levels or
### fewer, > 0 variance) to establish a reference score
# Columns with more than 2.9M filled entries count as "mostly complete"
mostly_complete <- empty_columns$notempties > 2900000
variables <- c("logerror", as.character(empty_columns[mostly_complete, "column_name"]))
# Positional filter: drops factors with too many levels (15 or more) or too
# little (no variance)
variables <- variables[c(1, 3:5, 7:9, 11, 17:21, 23:25)]
training0 <- training[, variables]
training0 <- training0[complete.cases(training0), ]
set.seed(232)
fit_baseline <- train(
  logerror ~ .,
  data = training0,
  method = "gbm",
  trControl = fitControl,
  tuneLength = 3,
  verbose = FALSE
)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
print(fit_baseline)
## Stochastic Gradient Boosting
##
## 66990 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 53592, 53592, 53592, 53591, 53593
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared
## 1 50 0.1588899 0.006560208
## 1 100 0.1588946 0.006806115
## 1 150 0.1588824 0.006553156
## 2 50 0.1587691 0.007412053
## 2 100 0.1587571 0.007412911
## 2 150 0.1587916 0.007372143
## 3 50 0.1587631 0.007446762
## 3 100 0.1587458 0.007672278
## 3 150 0.1588137 0.007496214
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 100,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Tuning profile of the baseline model
plot(fit_baseline)
# Score only the cross validation rows that have every baseline variable
complete_crossval <- crossval[complete.cases(crossval[, variables]), ]
predictions <- predict(fit_baseline, complete_crossval)
results <- data.frame(prediction = predictions, actual = complete_crossval$logerror)
# Predicted vs actual, coloured by squared error
ggplot(results, aes(x = actual, y = prediction, col = (prediction - actual)^2)) +
  geom_point()
# "\n" (not "/n") is the newline escape, so the label ends its own line
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
cat(sqrt(mean((results$prediction - results$actual)^2)))
## 0.1589634
varImp(fit_baseline)
## gbm variable importance
##
## only 20 most important variables shown (out of 28)
##
## Overall
## calculatedfinishedsquarefeet 100.0000
## latitude 50.2352
## structuretaxvaluedollarcnt 28.7231
## taxvaluedollarcnt 24.5112
## taxamount 22.0651
## landtaxvaluedollarcnt 13.6312
## longitude 9.8606
## bathroomcnt 8.3165
## yearbuilt 6.1897
## hashottuborspatrue 2.1677
## bedroomcnt 1.6773
## taxdelinquencyflagY 1.2854
## propertylandusetypeid263 0.6949
## propertylandusetypeid266 0.6364
## propertylandusetypeid246 0.3537
## propertylandusetypeid247 0.3399
## propertylandusetypeid275 0.0000
## propertylandusetypeid260 0.0000
## propertylandusetypeid47 0.0000
## roomcnt 0.0000
### Predict solely on NA or not NA
# Fit on the training partition only: fitting on all of train_complete would
# let the model see the crossval rows it is scored on afterwards.
# NOTE: the output recorded after this chunk came from the earlier fit on
# all of train_complete.
# Predictors are selected by name (every column except the key, the target
# and the sale date) rather than by hard-coded column positions.
na_predictors <- setdiff(colnames(training), c("parcelid", "logerror", "transactiondate"))
training0 <- training[, c(na_predictors, "logerror")]
# Replace every predictor with a TRUE/FALSE "is missing" indicator
training0[, na_predictors] <- is.na(training0[, na_predictors])
set.seed(232)
fit_na <- train(logerror ~ ., method = "gbm", data = training0, trControl = fitControl,
  tuneLength = 3, verbose = FALSE)
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
print(fit_na)
## Stochastic Gradient Boosting
##
## 90682 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 72546, 72546, 72544, 72546, 72546
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared
## 1 50 0.1628442 0.003689136
## 1 100 0.1628311 0.003841328
## 1 150 0.1628263 0.003944107
## 2 50 0.1628346 0.003881268
## 2 100 0.1628202 0.004107002
## 2 150 0.1628164 0.004237280
## 3 50 0.1628201 0.004126513
## 3 100 0.1628059 0.004301384
## 3 150 0.1628019 0.004454553
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Tuning profile of the NA-indicator model
plot(fit_na)
# Build the same "is missing" indicators for the cross validation rows,
# selecting the predictors by name (everything except key, target, sale date)
indicator_cols <- setdiff(colnames(crossval), c("parcelid", "logerror", "transactiondate"))
complete_crossval <- crossval[, c(indicator_cols, "logerror")]
complete_crossval[, indicator_cols] <- is.na(complete_crossval[, indicator_cols])
predictions <- predict(fit_na, complete_crossval)
results <- data.frame(prediction = predictions, actual = complete_crossval$logerror)
# Predicted vs actual, coloured by squared error
ggplot(results, aes(x = actual, y = prediction, col = (prediction - actual)^2)) +
  geom_point()
# "\n" (not "/n") is the newline escape, so the label ends its own line
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
cat(sqrt(mean((results$prediction - results$actual)^2)))
## 0.1612399
varImp(fit_na)
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## structuretaxvaluedollarcntTRUE 100.000
## regionidneighborhoodTRUE 70.422
## taxvaluedollarcntTRUE 66.104
## calculatedbathnbrTRUE 64.797
## regionidzipTRUE 58.075
## censustractandblockTRUE 50.285
## buildingclasstypeidTRUE 46.815
## airconditioningtypeidTRUE 41.452
## finishedsquarefeet12TRUE 32.495
## taxdelinquencyyearTRUE 30.457
## buildingqualitytypeidTRUE 29.946
## yearbuiltTRUE 23.796
## heatingorsystemtypeidTRUE 22.084
## finishedsquarefeet6TRUE 21.478
## finishedsquarefeet15TRUE 18.198
## lotsizesquarefeetTRUE 15.984
## calculatedfinishedsquarefeetTRUE 15.742
## numberofstoriesTRUE 14.073
## unitcntTRUE 11.915
## bathroomcntTRUE 9.616
### Predict on NA or not NA & scaled numerical data
### Predict on Imputed Features
### Predict on feature aberration from nearest surrounding units
### Predict on Imputed feature aberration from nearest surrounding units
######################## Validation Testing
######################## Parameter Tuning