Introduction

This is a guide through my exploration of the Kaggle Zillow Round One dataset. This is my first Kaggle competition.

I will do a thorough write-up of my exploration and predictive modeling, following the completion of Round 1.

Appendix

Code

######################## Set Environment

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggmap)
library(OpenStreetMap)
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
######################## Pre Process

# Load the 2016 training transactions
train <- read.csv(file = "Data/train_2016.csv")
# Fix column classes: parcel ids are identifiers, so store them as character;
# transaction dates are parsed with lubridate's ymd()
train[["parcelid"]] <- as.character(train[["parcelid"]])
train[["transactiondate"]] <- ymd(train[["transactiondate"]])

# Count the parcels that appear in more than one training transaction
train %>%
    group_by(parcelid) %>%
    summarise(sales = n()) %>%
    filter(sales > 1) %>%
    summarise(total = n())
## Warning: package 'bindrcpp' was built under R version 3.3.3
## # A tibble: 1 x 1
##   total
##   <int>
## 1   128
# Remove repeated transactions: keep only the first row found for each parcel.
# duplicated() flags every occurrence after the first, so negating it selects
# exactly one row per parcelid without needing a temporary counter column.
train <- train[!duplicated(train$parcelid), ]

### Property Data

# Load the 2016 property attribute table
properties <- read.csv(file = "Data/properties_2016.csv")
# Print a per-column summary of the raw table
summary(object = properties)
##     parcelid         airconditioningtypeid architecturalstyletypeid
##  Min.   : 10711725   Min.   : 1.0          Min.   : 2.0            
##  1st Qu.: 11643707   1st Qu.: 1.0          1st Qu.: 7.0            
##  Median : 12545094   Median : 1.0          Median : 7.0            
##  Mean   : 13325858   Mean   : 1.9          Mean   : 7.2            
##  3rd Qu.: 14097122   3rd Qu.: 1.0          3rd Qu.: 7.0            
##  Max.   :169601949   Max.   :13.0          Max.   :27.0            
##                      NA's   :2173698       NA's   :2979156         
##   basementsqft      bathroomcnt       bedroomcnt     buildingclasstypeid
##  Min.   :  20.0    Min.   : 0.000   Min.   : 0.000   Min.   :1.0        
##  1st Qu.: 272.0    1st Qu.: 2.000   1st Qu.: 2.000   1st Qu.:3.0        
##  Median : 534.0    Median : 2.000   Median : 3.000   Median :4.0        
##  Mean   : 646.9    Mean   : 2.209   Mean   : 3.089   Mean   :3.7        
##  3rd Qu.: 847.2    3rd Qu.: 3.000   3rd Qu.: 4.000   3rd Qu.:4.0        
##  Max.   :8516.0    Max.   :20.000   Max.   :20.000   Max.   :5.0        
##  NA's   :2983589   NA's   :11462    NA's   :11450    NA's   :2972588    
##  buildingqualitytypeid calculatedbathnbr   decktypeid     
##  Min.   : 1.0          Min.   : 1.0      Min.   :66       
##  1st Qu.: 4.0          1st Qu.: 2.0      1st Qu.:66       
##  Median : 7.0          Median : 2.0      Median :66       
##  Mean   : 5.8          Mean   : 2.3      Mean   :66       
##  3rd Qu.: 7.0          3rd Qu.: 3.0      3rd Qu.:66       
##  Max.   :12.0          Max.   :20.0      Max.   :66       
##  NA's   :1046729       NA's   :128912    NA's   :2968121  
##  finishedfloor1squarefeet calculatedfinishedsquarefeet
##  Min.   :    3            Min.   :     1              
##  1st Qu.: 1012            1st Qu.:  1213              
##  Median : 1283            Median :  1572              
##  Mean   : 1381            Mean   :  1827              
##  3rd Qu.: 1615            3rd Qu.:  2136              
##  Max.   :31303            Max.   :952576              
##  NA's   :2782500          NA's   :55565               
##  finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15
##  Min.   :     1       Min.   : 120         Min.   :   112      
##  1st Qu.:  1196       1st Qu.: 960         1st Qu.:  1694      
##  Median :  1539       Median :1296         Median :  2172      
##  Mean   :  1760       Mean   :1179         Mean   :  2739      
##  3rd Qu.:  2070       3rd Qu.:1440         3rd Qu.:  2976      
##  Max.   :290345       Max.   :2688         Max.   :820242      
##  NA's   :276033       NA's   :2977545      NA's   :2794419     
##  finishedsquarefeet50 finishedsquarefeet6      fips      
##  Min.   :    3        Min.   :   117      Min.   :6037   
##  1st Qu.: 1013        1st Qu.:  1079      1st Qu.:6037   
##  Median : 1284        Median :  1992      Median :6037   
##  Mean   : 1389        Mean   :  2414      Mean   :6048   
##  3rd Qu.: 1618        3rd Qu.:  3366      3rd Qu.:6059   
##  Max.   :31303        Max.   :952576      Max.   :6111   
##  NA's   :2782500      NA's   :2963216     NA's   :11437  
##   fireplacecnt      fullbathcnt      garagecarcnt     garagetotalsqft  
##  Min.   :1.0       Min.   : 1.00    Min.   : 0.0      Min.   :   0.0   
##  1st Qu.:1.0       1st Qu.: 2.00    1st Qu.: 2.0      1st Qu.: 324.0   
##  Median :1.0       Median : 2.00    Median : 2.0      Median : 441.0   
##  Mean   :1.2       Mean   : 2.24    Mean   : 1.8      Mean   : 383.8   
##  3rd Qu.:1.0       3rd Qu.: 3.00    3rd Qu.: 2.0      3rd Qu.: 494.0   
##  Max.   :9.0       Max.   :20.00    Max.   :25.0      Max.   :7749.0   
##  NA's   :2672580   NA's   :128912   NA's   :2101950   NA's   :2101950  
##  hashottuborspa heatingorsystemtypeid    latitude       
##      :2916203   Min.   : 1            Min.   :33324388  
##  true:  69014   1st Qu.: 2            1st Qu.:33827685  
##                 Median : 2            Median :34008249  
##                 Mean   : 4            Mean   :34001469  
##                 3rd Qu.: 7            3rd Qu.:34161860  
##                 Max.   :24            Max.   :34819650  
##                 NA's   :1178816       NA's   :11437     
##    longitude          lotsizesquarefeet      poolcnt       
##  Min.   :-119475780   Min.   :      100   Min.   :1        
##  1st Qu.:-118392983   1st Qu.:     5688   1st Qu.:1        
##  Median :-118172540   Median :     7000   Median :1        
##  Mean   :-118201934   Mean   :    22823   Mean   :1        
##  3rd Qu.:-117949468   3rd Qu.:     9898   3rd Qu.:1        
##  Max.   :-117554316   Max.   :328263808   Max.   :1        
##  NA's   :11437        NA's   :276099      NA's   :2467683  
##   poolsizesum       pooltypeid10      pooltypeid2       pooltypeid7     
##  Min.   :   19.0   Min.   :1         Min.   :1         Min.   :1        
##  1st Qu.:  430.0   1st Qu.:1         1st Qu.:1         1st Qu.:1        
##  Median :  495.0   Median :1         Median :1         Median :1        
##  Mean   :  519.7   Mean   :1         Mean   :1         Mean   :1        
##  3rd Qu.:  594.0   3rd Qu.:1         3rd Qu.:1         3rd Qu.:1        
##  Max.   :17410.0   Max.   :1         Max.   :1         Max.   :1        
##  NA's   :2957257   NA's   :2948278   NA's   :2953142   NA's   :2499758  
##  propertycountylandusecode propertylandusetypeid propertyzoningdesc
##  0100   :1153896           Min.   : 31                  :1006588   
##  122    : 522145           1st Qu.:261           LAR1   : 275029   
##  0101   : 247494           Median :261           LAR3   :  67105   
##  010C   : 225410           Mean   :260           LARS   :  54859   
##  1111   : 126491           3rd Qu.:261           LBR1N  :  52750   
##  34     : 123249           Max.   :275           LAR2   :  48808   
##  (Other): 586532           NA's   :11437         (Other):1480078   
##  rawcensustractandblock  regionidcity    regionidcounty 
##  Min.   :60371011       Min.   :  3491   Min.   :1286   
##  1st Qu.:60373203       1st Qu.: 12447   1st Qu.:2061   
##  Median :60375712       Median : 25218   Median :3101   
##  Mean   :60483450       Mean   : 34993   Mean   :2570   
##  3rd Qu.:60590423       3rd Qu.: 45457   3rd Qu.:3101   
##  Max.   :61110091       Max.   :396556   Max.   :3101   
##  NA's   :11437          NA's   :62845    NA's   :11437  
##  regionidneighborhood  regionidzip        roomcnt        storytypeid     
##  Min.   :  6952       Min.   : 95982   Min.   : 0.000   Min.   :7        
##  1st Qu.: 46736       1st Qu.: 96180   1st Qu.: 0.000   1st Qu.:7        
##  Median :118920       Median : 96377   Median : 0.000   Median :7        
##  Mean   :193476       Mean   : 96553   Mean   : 1.475   Mean   :7        
##  3rd Qu.:274800       3rd Qu.: 96974   3rd Qu.: 0.000   3rd Qu.:7        
##  Max.   :764167       Max.   :399675   Max.   :96.000   Max.   :7        
##  NA's   :1828815      NA's   :13980    NA's   :11475    NA's   :2983593  
##  threequarterbathnbr typeconstructiontypeid    unitcnt       
##  Min.   :1           Min.   : 4             Min.   :  1.0    
##  1st Qu.:1           1st Qu.: 6             1st Qu.:  1.0    
##  Median :1           Median : 6             Median :  1.0    
##  Mean   :1           Mean   : 6             Mean   :  1.2    
##  3rd Qu.:1           3rd Qu.: 6             3rd Qu.:  1.0    
##  Max.   :7           Max.   :13             Max.   :997.0    
##  NA's   :2673586     NA's   :2978470        NA's   :1007727  
##  yardbuildingsqft17 yardbuildingsqft26   yearbuilt     numberofstories  
##  Min.   :  10.0     Min.   :  10.0     Min.   :1801    Min.   : 1.0     
##  1st Qu.: 190.0     1st Qu.:  96.0     1st Qu.:1950    1st Qu.: 1.0     
##  Median : 270.0     Median : 168.0     Median :1963    Median : 1.0     
##  Mean   : 319.8     Mean   : 278.3     Mean   :1964    Mean   : 1.4     
##  3rd Qu.: 390.0     3rd Qu.: 320.0     3rd Qu.:1981    3rd Qu.: 2.0     
##  Max.   :7983.0     Max.   :6141.0     Max.   :2015    Max.   :41.0     
##  NA's   :2904862    NA's   :2982570    NA's   :59928   NA's   :2303148  
##  fireplaceflag  structuretaxvaluedollarcnt taxvaluedollarcnt  
##      :2980054   Min.   :        1          Min.   :        1  
##  true:   5163   1st Qu.:    74800          1st Qu.:   179675  
##                 Median :   122590          Median :   306086  
##                 Mean   :   170884          Mean   :   420479  
##                 3rd Qu.:   196889          3rd Qu.:   488000  
##                 Max.   :251486000          Max.   :282786000  
##                 NA's   :54982              NA's   :42550      
##  assessmentyear  landtaxvaluedollarcnt   taxamount      
##  Min.   :2000    Min.   :       1      Min.   :      1  
##  1st Qu.:2015    1st Qu.:   74836      1st Qu.:   2461  
##  Median :2015    Median :  167042      Median :   3992  
##  Mean   :2015    Mean   :  252478      Mean   :   5378  
##  3rd Qu.:2015    3rd Qu.:  306918      3rd Qu.:   6201  
##  Max.   :2016    Max.   :90246219      Max.   :3458861  
##  NA's   :11439   NA's   :67733         NA's   :31250    
##  taxdelinquencyflag taxdelinquencyyear censustractandblock 
##   :2928755          Min.   : 0.0       Min.   :-1.000e+00  
##  Y:  56462          1st Qu.:14.0       1st Qu.: 6.037e+13  
##                     Median :14.0       Median : 6.038e+13  
##                     Mean   :13.9       Mean   : 6.048e+13  
##                     3rd Qu.:15.0       3rd Qu.: 6.059e+13  
##                     Max.   :99.0       Max.   : 4.830e+14  
##                     NA's   :2928753    NA's   :75126
# Reclassify the property columns, one direct assignment per step. A progress
# marker is written with cat() after every step so that a slow or failing
# conversion can be located from the console output.

# Parcel ids are identifiers, not quantities
properties$parcelid <- as.character(properties$parcelid)
cat("parcelid\n")

# Coded type / id columns become factors
properties$airconditioningtypeid <- as.factor(properties$airconditioningtypeid)
cat("airconditiontypeid\n")
properties$buildingclasstypeid <- as.factor(properties$buildingclasstypeid)
cat("buildingclasstypeid\n")
properties$buildingqualitytypeid <- as.factor(properties$buildingqualitytypeid)
cat("buildingqualitytypeid\n")
properties$decktypeid <- as.factor(properties$decktypeid)
cat("decktypeid\n")
properties$fips <- as.factor(properties$fips)
cat("fips\n")

# Flag column: blank entries are recoded to "false" before re-factoring
properties$hashottuborspa <- as.character(properties$hashottuborspa)
cat("hashottuborspa0\n")
properties$hashottuborspa[properties$hashottuborspa == ""] <- "false"
cat("hashottuborspa1\n")
properties$hashottuborspa <- as.factor(properties$hashottuborspa)
cat("hashottuborspa2\n")

properties$heatingorsystemtypeid <- as.factor(properties$heatingorsystemtypeid)
cat("heatingorsystemtypeid\n")
properties$pooltypeid10 <- as.factor(properties$pooltypeid10)
cat("pooltypeid10\n")
properties$pooltypeid2 <- as.factor(properties$pooltypeid2)
cat("pooltypeid2\n")
properties$pooltypeid7 <- as.factor(properties$pooltypeid7)
cat("pooltypeid7\n")
properties$propertylandusetypeid <- as.factor(properties$propertylandusetypeid)
cat("propertylandusetypeid\n")

# Geographic identifiers
properties$rawcensustractandblock <- as.factor(properties$rawcensustractandblock)
cat("rawcensustractandblock\n")
properties$regionidcity <- as.factor(properties$regionidcity)
cat("regionidcity\n")
properties$regionidcounty <- as.factor(properties$regionidcounty)
cat("regionidcounty\n")
properties$regionidneighborhood <- as.factor(properties$regionidneighborhood)
cat("regionidneighborhood\n")
# Passed through zipcode::clean.zipcodes() before factoring.
# NOTE(review): confirm how that helper treats these region ids
properties$regionidzip <- as.factor(zipcode::clean.zipcodes(properties$regionidzip))
cat("regionidzip\n")

properties$storytypeid <- as.factor(properties$storytypeid)
cat("storytypeid\n")
properties$typeconstructiontypeid <- as.factor(properties$typeconstructiontypeid)
cat("typeconstructiontypeid\n")

# Flag column: blank entries are recoded to "false" before re-factoring
properties$fireplaceflag <- as.character(properties$fireplaceflag)
cat("fireplaceflag0\n")
properties$fireplaceflag[properties$fireplaceflag == ""] <- "false"
cat("fireplaceflag1\n")
properties$fireplaceflag <- as.factor(properties$fireplaceflag)
cat("fireplaceflag2\n")

# Flag column: blank entries are recoded to "false"; the non-blank value in
# the raw data is "Y", so the resulting levels are "false" and "Y"
properties$taxdelinquencyflag <- as.character(properties$taxdelinquencyflag)
cat("taxdelinquencyflag0\n")
properties$taxdelinquencyflag[properties$taxdelinquencyflag == ""] <- "false"
cat("taxdelinquencyflag1\n")
properties$taxdelinquencyflag <- as.factor(properties$taxdelinquencyflag)
cat("taxdelinquencyflag2\n")

properties$censustractandblock <- as.factor(properties$censustractandblock)
cat("censustractandblock\n")
## parcelid
## airconditiontypeid
## buildingclasstypeid
## buildingqualitytypeid
## decktypeid
## fips
## hashottuborspa0
## hashottuborspa1
## hashottuborspa2
## heatingorsystemtypeid
## pooltypeid10
## pooltypeid2
## pooltypeid7
## propertylandusetypeid
## rawcensustractandblock
## regionidcity
## regionidcounty
## regionidneighborhood
## regionidzip
## storytypeid
## typeconstructiontypeid
## fireplaceflag0
## fireplaceflag1
## fireplaceflag2
## taxdelinquencyflag0
## taxdelinquencyflag1
## taxdelinquencyflag2
## censustractandblock
# Rescale both coordinate columns by dividing by 1e+06 (the raw columns hold
# values such as 34008249 and -118172540)
properties[c("latitude", "longitude")] <- lapply(
    properties[c("latitude", "longitude")],
    function(coordinate) coordinate/1e+06)

### Merged

# Join the transactions onto the property table by parcel id.
# properties_complete keeps every property (all.x = TRUE), so the transaction
# columns are NA where a parcel has no transaction; train_complete keeps only
# parcels that have one. merge() sorts its result by the key by default, so
# the row order is not guaranteed to match the row order of `properties`.
properties_complete <- merge(x = properties, y = train, by = "parcelid", all.x = TRUE)
train_complete <- merge(x = properties, y = train, by = "parcelid")

######################## Exploratory Analysis

## Examine Train Set

# Five-number summary (plus mean) of the target variable
summary(train[["logerror"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.60500 -0.02630  0.00500  0.01077  0.03830  4.73700
# Histogram of the log errors
ggplot(train, aes(logerror)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of transaction dates
ggplot(train, aes(transactiondate)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Log error against transaction date, with a smoothed trend line
ggplot(train, aes(transactiondate, logerror)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'gam'

## Examine Missing Values

# Count the missing property values in each row and flag the rows that have a
# training transaction. All three columns are taken from properties_complete
# so that they describe the same parcel: merge() re-sorts rows by parcelid, so
# a count computed on `properties` would not line up row-for-row with the
# logerror column of the merged table. Only the property columns are counted
# (the transaction columns are excluded by selecting colnames(properties)).
emptyrows <- data.frame(
    empties = rowSums(is.na(properties_complete[, colnames(properties)])),
    sold = !is.na(properties_complete$logerror),
    margin = properties_complete$logerror)
# Histogram of NA counts per row, split by whether the parcel was sold
ggplot(data = emptyrows, aes(x = empties, fill = sold)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Count the non-missing entries in every property column
empty_columns <- data.frame(
    column_name = names(properties),
    notempties = colSums(!is.na(properties)))

# Horizontal bar chart of those counts, with columns ordered by count
ggplot(empty_columns,
       aes(x = reorder(column_name, notempties), y = notempties, fill = notempties)) +
    geom_bar(stat = "identity") +
    coord_flip()

# Histogram of NA counts, restricted to rows that have a log error
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Signed log error against the number of NAs in the row
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties, y = margin)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'gam'

# Absolute log error against the number of NAs in the row
ggplot(filter(emptyrows, !is.na(margin)), aes(x = empties, y = abs(margin))) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'gam'

## Understand How Property Construction Changes Across Periods

# Histogram of construction year across all properties
ggplot(properties, aes(yearbuilt)) +
    geom_histogram() +
    ggtitle("building across time built") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 59928 rows containing non-finite values (stat_bin).

# Box plot of absolute log error per construction year. Author's observation:
# rows with an NA yearbuilt show higher error than dated ones, which makes
# yearbuilt an important value to impute.
ggplot(train_complete, aes(as.factor(yearbuilt), abs(logerror))) +
    geom_boxplot() +
    ggtitle("error across time built") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

# TODO: correlate yearbuilt with all other numeric columns

# Map every property, coloured by year built
# Bounding box around all property coordinates
bbox2 <- make_bbox(properties_complete$longitude, properties_complete$latitude,
    f = 1e-05)
# Fetch the "esri" base map for that box and pass it through openproj().
# Elements 4,1 and 2,3 of the box are handed to openmap() as its two corner
# arguments -- presumably (top, left) and (bottom, right); confirm against
# the make_bbox()/openmap() documentation.
map <- openproj(openmap(bbox2[c(4, 1)], bbox2[2:3], type = "esri"))
# Base-map plot, then the property points layered on top
p0 <- autoplot(map)
p <- p0 +
    geom_point(data = properties_complete,
               aes(x = longitude, y = latitude, col = yearbuilt),
               size = 0.5, alpha = 0.8) +
    scale_colour_distiller(palette = "Spectral") +
    ggtitle("Properties by Year Built")

p
## Warning: Removed 11437 rows containing missing values (geom_point).

######################## Modeling and Feature Engineering

### Split the merged transactions: createDataPartition() with p = 0.75 picks
### the rows for `training`; the remaining rows form `crossval`
set.seed(232)
trainIndex <- createDataPartition(y = train_complete$logerror, p = 0.75, list = FALSE)
training <- train_complete[trainIndex, ]
crossval <- train_complete[-trainIndex, ]

### True Baseline - use the mean training logerror as the prediction for every
### held-out row and report the resulting RMSE on the cross validation set
train_meanlogerror <- mean(training$logerror)
# End the label with the "\n" newline escape; a forward-slash "/n" is not an
# escape sequence and would be printed literally
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
cat(sqrt(mean((train_meanlogerror - crossval$logerror)^2)))
## 0.1618654
### Resampling scheme used for feature engineering: 5-fold cross validation
fitControl <- trainControl(number = 5, method = "cv")

### Baseline: fit a gbm on the most complete variables only
# Start from logerror plus every column with more than 2,900,000 non-missing
# values
variables <- c("logerror",
    as.character(empty_columns$column_name[empty_columns$notempties > 2900000]))
# Keep a hand-picked subset by position. Per the author's note, the dropped
# positions are factors with too many levels (15 or more) or columns with no
# variance. NOTE(review): these positions depend on the column order of
# `properties`; re-check them if the input schema changes.
variables <- variables[c(1, 3:5, 7:9, 11, 17:21, 23:25)]
# Restrict the training rows to those columns and drop rows with any NA
training0 <- training[variables]
training0 <- training0[complete.cases(training0), ]

set.seed(232)
fit_baseline <- train(logerror ~ ., data = training0, method = "gbm",
    trControl = fitControl, tuneLength = 3, verbose = FALSE)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 7: propertylandusetypeid47 has no variation.
## Warning in gbm.fit(x = structure(c(2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 3, 4, 2, 2, 2, 3, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 4, 3, 2, 2, 2, 3, 2, 3, :
## variable 19: propertylandusetypeid270 has no variation.
## Warning in gbm.fit(x = structure(c(2, 2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 3, 2, :
## variable 19: propertylandusetypeid270 has no variation.
# Print the fitted caret object: resampling results per tuning combination and
# the final values selected
fit_baseline
## Stochastic Gradient Boosting 
## 
## 66990 samples
##    15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 53592, 53592, 53592, 53591, 53593 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE       Rsquared   
##   1                   50      0.1588899  0.006560208
##   1                  100      0.1588946  0.006806115
##   1                  150      0.1588824  0.006553156
##   2                   50      0.1587691  0.007412053
##   2                  100      0.1587571  0.007412911
##   2                  150      0.1587916  0.007372143
##   3                   50      0.1587631  0.007446762
##   3                  100      0.1587458  0.007672278
##   3                  150      0.1588137  0.007496214
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 100,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot the fitted caret object
plot(fit_baseline)

# Score the held-out rows that have no NA in any of the model columns
complete_crossval <- crossval[complete.cases(crossval[variables]), ]
predictions <- predict(fit_baseline, newdata = complete_crossval)
results <- data.frame(prediction = predictions, actual = complete_crossval$logerror)
# Predicted against actual log error, coloured by squared error
ggplot(results, aes(actual, prediction, col = (prediction - actual)^2)) +
    geom_point()

# Report the RMSE of the baseline model on the held-out rows. The label ends
# with the "\n" newline escape; a forward-slash "/n" is not an escape sequence
# and would be printed literally.
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
cat(sqrt(mean((results$prediction - results$actual)^2)))
## 0.1589634
# Variable importance scores for the baseline fit
varImp(fit_baseline)
## gbm variable importance
## 
##   only 20 most important variables shown (out of 28)
## 
##                               Overall
## calculatedfinishedsquarefeet 100.0000
## latitude                      50.2352
## structuretaxvaluedollarcnt    28.7231
## taxvaluedollarcnt             24.5112
## taxamount                     22.0651
## landtaxvaluedollarcnt         13.6312
## longitude                      9.8606
## bathroomcnt                    8.3165
## yearbuilt                      6.1897
## hashottuborspatrue             2.1677
## bedroomcnt                     1.6773
## taxdelinquencyflagY            1.2854
## propertylandusetypeid263       0.6949
## propertylandusetypeid266       0.6364
## propertylandusetypeid246       0.3537
## propertylandusetypeid247       0.3399
## propertylandusetypeid275       0.0000
## propertylandusetypeid260       0.0000
## propertylandusetypeid47        0.0000
## roomcnt                        0.0000
### Predict solely on NA or not NA
# Take columns 2:59 of the merged training table and replace the first 57 of
# them with logical missing-value indicators, leaving the 58th untouched.
# NOTE(review): this assumes column 1 is parcelid and column 59 is logerror --
# confirm against the merged column order.
training0 <- train_complete[2:59]
training0[1:57] <- lapply(training0[1:57], is.na)

# NOTE(review): this fit uses every row of train_complete rather than the
# `training` split -- confirm that including the crossval rows is intended.
set.seed(232)
fit_na <- train(logerror ~ ., data = training0, method = "gbm",
    trControl = fitControl, tuneLength = 3, verbose = FALSE)
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 22: hashottuborspaTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 32: propertycountylandusecodeTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 34: propertyzoningdescTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 49: fireplaceflagTRUE has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, :
## variable 55: taxdelinquencyflagTRUE has no variation.
# Show the resampling summary and the selected tuning values for fit_na.
print(fit_na)
## Stochastic Gradient Boosting 
## 
## 90682 samples
##    57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 72546, 72546, 72544, 72546, 72546 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE       Rsquared   
##   1                   50      0.1628442  0.003689136
##   1                  100      0.1628311  0.003841328
##   1                  150      0.1628263  0.003944107
##   2                   50      0.1628346  0.003881268
##   2                  100      0.1628202  0.004107002
##   2                  150      0.1628164  0.004237280
##   3                   50      0.1628201  0.004126513
##   3                  100      0.1628059  0.004301384
##   3                  150      0.1628019  0.004454553
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Resampling profile of fit_na across its tuning grid.
plot(fit_na)

# Build the hold-out frame: columns 2..59 of crossval, with the first 57
# columns replaced by logical "is this value missing?" flags.
complete_crossval <- crossval[, 2:59]
complete_crossval[1:57] <- lapply(complete_crossval[1:57], is.na)

# Predict on the hold-out frame and pair each prediction with the observed
# logerror.
predictions <- predict(fit_na, newdata = complete_crossval)
results <- data.frame(
    prediction = predictions,
    actual = complete_crossval$logerror
)

# Predicted vs. actual, coloured by squared error.
ggplot(
    data = results,
    mapping = aes(x = actual, y = prediction, colour = (prediction - actual)^2)
) +
    geom_point()

# Report the root mean square error of the predictions held in `results`.
# The newline escape is "\n"; the previous "/n" was printed literally.
cat("Root Mean Square Error:\n")
## Root Mean Square Error:
# End the value with a newline so later console output starts on its own line.
cat(sqrt(mean((results$prediction - results$actual)^2)), "\n", sep = "")
## 0.1612399
# Rank the predictors of fit_na by variable importance.
varImp(fit_na)
## gbm variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                                  Overall
## structuretaxvaluedollarcntTRUE   100.000
## regionidneighborhoodTRUE          70.422
## taxvaluedollarcntTRUE             66.104
## calculatedbathnbrTRUE             64.797
## regionidzipTRUE                   58.075
## censustractandblockTRUE           50.285
## buildingclasstypeidTRUE           46.815
## airconditioningtypeidTRUE         41.452
## finishedsquarefeet12TRUE          32.495
## taxdelinquencyyearTRUE            30.457
## buildingqualitytypeidTRUE         29.946
## yearbuiltTRUE                     23.796
## heatingorsystemtypeidTRUE         22.084
## finishedsquarefeet6TRUE           21.478
## finishedsquarefeet15TRUE          18.198
## lotsizesquarefeetTRUE             15.984
## calculatedfinishedsquarefeetTRUE  15.742
## numberofstoriesTRUE               14.073
## unitcntTRUE                       11.915
## bathroomcntTRUE                    9.616
### Predict on NA or not NA & scaled numerical data



### Predict on Imputed Features



### Predict on feature aberration from nearest surrounding units



### Predict on Imputed feature aberration from nearest surrounding units


######################## Validation Testing


######################## Parameter Tuning