Start of Code
Load packages
Read in Data
# Read the cleaned listing data from the Excel workbook
library(readxl)
data_gbm_clean <- read_excel("/Users/sawyerbenson/Documents/Master Thesis/Thesis_Github/Models/Data/New Data/3. data_factor_cleaned.xlsx")

# Structure change: every column named here is recoded as a factor.
# Note that beds_total, bath_full and bath_half are treated as categorical too.
data_gbm_clean[c(
  "property_type", "ac_type", "patio", "school_general", "pool",
  "roof_type", "gas_type", "out_building", "appliances", "garage",
  "property_condition", "energy_efficient", "exterior_type",
  "exterior_features", "fireplace", "foundation_type", "beds_total",
  "bath_full", "bath_half", "sewer_type", "property_style",
  "subdivision", "water_type", "waterfront"
)] <- lapply(
  data_gbm_clean[c(
    "property_type", "ac_type", "patio", "school_general", "pool",
    "roof_type", "gas_type", "out_building", "appliances", "garage",
    "property_condition", "energy_efficient", "exterior_type",
    "exterior_features", "fireplace", "foundation_type", "beds_total",
    "bath_full", "bath_half", "sewer_type", "property_style",
    "subdivision", "water_type", "waterfront"
  )],
  as.factor
)

# sold_date: convert with openxlsx to a Date, then store the Date's numeric value
data_gbm_clean$sold_date <- as.numeric(
  openxlsx::convertToDate(data_gbm_clean$sold_date)
)
str(data_gbm_clean)
tibble [24,412 × 48] (S3: tbl_df/tbl/data.frame)
$ mls_number : chr [1:24412] "CNNN5274" "CNNN5241" "CNN104918" "CNN104870" ...
$ property_type : Factor w/ 6 levels "CND","DUP","OTH",..: 5 5 5 5 5 5 5 5 5 5 ...
$ ac_type : Factor w/ 3 levels "central","none",..: 1 3 1 1 1 1 1 1 1 1 ...
$ list_price : num [1:24412] 187000 250000 224900 225000 274900 ...
$ patio : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 2 2 ...
$ school_general : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ photo_count : num [1:24412] 0 0 0 0 25 2 6 17 17 15 ...
$ pool : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 2 1 ...
$ roof_type : Factor w/ 4 levels "metal","other",..: 3 3 2 3 3 3 2 3 2 2 ...
$ gas_type : Factor w/ 5 levels "butane","natural",..: 5 5 5 5 5 5 5 5 5 5 ...
$ out_building : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
$ area_living : num [1:24412] 2054 2120 2078 1923 2184 ...
$ land_acres : num [1:24412] 0.28 0.4 0.29 0.36 0.82 0.36 1 1.27 0.63 2.01 ...
$ appliances : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 2 2 2 ...
$ garage : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 2 2 2 ...
$ property_condition : Factor w/ 3 levels "excellent","new",..: 3 3 3 3 3 3 3 3 3 3 ...
$ energy_efficient : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 2 ...
$ exterior_type : Factor w/ 5 levels "brick","metal",..: 3 3 4 4 1 3 4 1 3 3 ...
$ exterior_features : Factor w/ 6 levels "balcony","courtyard",..: 4 4 4 5 5 3 4 5 3 3 ...
$ fireplace : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 2 2 ...
$ foundation_type : Factor w/ 3 levels "raised","slab",..: 1 2 1 2 2 3 2 2 2 2 ...
$ area_total : num [1:24412] 2254 2120 2962 2550 3510 ...
$ beds_total : Factor w/ 7 levels "0","1","2","3",..: 4 5 4 4 4 4 5 4 5 5 ...
$ bath_full : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 3 3 3 3 3 ...
$ bath_half : Factor w/ 6 levels "0","1","2","3",..: 1 1 1 2 1 1 1 1 2 1 ...
$ age : num [1:24412] 82 9 70 27 7 6 38 32 15 5 ...
$ dom : num [1:24412] 78 83 89 203 231 54 144 108 26 25 ...
$ sold_price : num [1:24412] 169000 245000 230000 220000 272000 ...
$ sold_date : num [1:24412] 16843 17123 17228 17336 17324 ...
$ sewer_type : Factor w/ 3 levels "city","septic",..: 1 1 1 2 2 1 3 3 1 3 ...
$ property_style : Factor w/ 2 levels "mobile","not_mobile": 2 2 2 2 2 2 2 2 2 2 ...
$ city_limits : num [1:24412] 1 1 1 1 1 1 1 1 1 1 ...
$ subdivision : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 2 2 2 ...
$ water_type : Factor w/ 2 levels "public","well": 1 1 1 1 1 1 1 1 1 1 ...
$ waterfront : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
$ infections_daily : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ infections_accum : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ corona_date_split : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ infections_3mma : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ top25_sold_price : num [1:24412] 0 1 0 0 1 1 0 1 1 1 ...
$ top50_sold_price : num [1:24412] 0 1 1 1 1 1 1 1 1 1 ...
$ bottom25_sold_price : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ top25_area_living : num [1:24412] 1 1 1 0 1 1 1 1 1 0 ...
$ bottom25_area_living: num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ top25_age : num [1:24412] 1 0 1 0 0 0 0 0 0 0 ...
$ bottom25_age : num [1:24412] 0 1 0 0 1 1 0 0 1 1 ...
$ top25_dom : num [1:24412] 0 0 0 1 1 0 1 0 0 0 ...
$ bottom25_dom : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
# Splits: recode the numeric indicator columns below as factors
# (top50_sold_price is not in the list and stays numeric).
data_gbm_clean[c(
  "city_limits", "corona_date_split",
  "top25_sold_price", "bottom25_sold_price",
  "top25_area_living", "bottom25_area_living",
  "top25_age", "bottom25_age",
  "top25_dom", "bottom25_dom"
)] <- lapply(
  data_gbm_clean[c(
    "city_limits", "corona_date_split",
    "top25_sold_price", "bottom25_sold_price",
    "top25_area_living", "bottom25_area_living",
    "top25_age", "bottom25_age",
    "top25_dom", "bottom25_dom"
  )],
  as.factor
)

# infections_period: 1 where accumulated infections exceed 1000, else 0,
# stored as a factor
data_gbm_clean$infections_period <- as.factor(
  as.numeric(data_gbm_clean$infections_accum > 1000)
)
str(data_gbm_clean)
tibble [24,412 × 49] (S3: tbl_df/tbl/data.frame)
$ mls_number : chr [1:24412] "CNNN5274" "CNNN5241" "CNN104918" "CNN104870" ...
$ property_type : Factor w/ 6 levels "CND","DUP","OTH",..: 5 5 5 5 5 5 5 5 5 5 ...
$ ac_type : Factor w/ 3 levels "central","none",..: 1 3 1 1 1 1 1 1 1 1 ...
$ list_price : num [1:24412] 187000 250000 224900 225000 274900 ...
$ patio : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 2 2 ...
$ school_general : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ photo_count : num [1:24412] 0 0 0 0 25 2 6 17 17 15 ...
$ pool : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 2 1 ...
$ roof_type : Factor w/ 4 levels "metal","other",..: 3 3 2 3 3 3 2 3 2 2 ...
$ gas_type : Factor w/ 5 levels "butane","natural",..: 5 5 5 5 5 5 5 5 5 5 ...
$ out_building : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
$ area_living : num [1:24412] 2054 2120 2078 1923 2184 ...
$ land_acres : num [1:24412] 0.28 0.4 0.29 0.36 0.82 0.36 1 1.27 0.63 2.01 ...
$ appliances : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 2 2 2 ...
$ garage : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 2 2 2 ...
$ property_condition : Factor w/ 3 levels "excellent","new",..: 3 3 3 3 3 3 3 3 3 3 ...
$ energy_efficient : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 2 ...
$ exterior_type : Factor w/ 5 levels "brick","metal",..: 3 3 4 4 1 3 4 1 3 3 ...
$ exterior_features : Factor w/ 6 levels "balcony","courtyard",..: 4 4 4 5 5 3 4 5 3 3 ...
$ fireplace : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 2 2 ...
$ foundation_type : Factor w/ 3 levels "raised","slab",..: 1 2 1 2 2 3 2 2 2 2 ...
$ area_total : num [1:24412] 2254 2120 2962 2550 3510 ...
$ beds_total : Factor w/ 7 levels "0","1","2","3",..: 4 5 4 4 4 4 5 4 5 5 ...
$ bath_full : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 3 3 3 3 3 ...
$ bath_half : Factor w/ 6 levels "0","1","2","3",..: 1 1 1 2 1 1 1 1 2 1 ...
$ age : num [1:24412] 82 9 70 27 7 6 38 32 15 5 ...
$ dom : num [1:24412] 78 83 89 203 231 54 144 108 26 25 ...
$ sold_price : num [1:24412] 169000 245000 230000 220000 272000 ...
$ sold_date : num [1:24412] 16843 17123 17228 17336 17324 ...
$ sewer_type : Factor w/ 3 levels "city","septic",..: 1 1 1 2 2 1 3 3 1 3 ...
$ property_style : Factor w/ 2 levels "mobile","not_mobile": 2 2 2 2 2 2 2 2 2 2 ...
$ city_limits : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ subdivision : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 2 2 2 ...
$ water_type : Factor w/ 2 levels "public","well": 1 1 1 1 1 1 1 1 1 1 ...
$ waterfront : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
$ infections_daily : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ infections_accum : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ corona_date_split : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ infections_3mma : num [1:24412] 0 0 0 0 0 0 0 0 0 0 ...
$ top25_sold_price : Factor w/ 2 levels "0","1": 1 2 1 1 2 2 1 2 2 2 ...
$ top50_sold_price : num [1:24412] 0 1 1 1 1 1 1 1 1 1 ...
$ bottom25_sold_price : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ top25_area_living : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 2 1 ...
$ bottom25_area_living: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ top25_age : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 1 ...
$ bottom25_age : Factor w/ 2 levels "0","1": 1 2 1 1 2 2 1 1 2 2 ...
$ top25_dom : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 1 1 1 ...
$ bottom25_dom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ infections_period : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# Remove the implausible '20' level in bath_full
levels(data_gbm_clean$bath_full)
[1] "0" "1" "2" "3" "4" "6" "20"
# Set rows whose bath_full equals "20" to NA, then rebuild the factor so the
# now-unused level is dropped; finally list the remaining levels.
bath_full_is_20 <- data_gbm_clean$bath_full == "20"
is.na(data_gbm_clean$bath_full) <- bath_full_is_20
data_gbm_clean$bath_full <- factor(data_gbm_clean$bath_full)
levels(data_gbm_clean$bath_full)
[1] "0" "1" "2" "3" "4" "6"
# Remove beds_total > 5
levels(data_gbm_clean$beds_total)
[1] "0" "1" "2" "3" "4" "5" "6"
# Set beds_total values above 5 to NA and drop the emptied levels in one step.
# "7" is kept in the set for parity with the original two-step code, although
# the levels printed above stop at "6". %in% never yields NA, so rows that are
# already missing are left alone.
is.na(data_gbm_clean$beds_total) <- data_gbm_clean$beds_total %in% c("6", "7")
data_gbm_clean$beds_total <- factor(data_gbm_clean$beds_total)
# Non-linear additions: squared age and squared living area (kept wrapped in I())
data_gbm_clean[["age_2"]] <- I(data_gbm_clean[["age"]]^2)
data_gbm_clean[["area_living_2"]] <- I(data_gbm_clean[["area_living"]]^2)

# Removals: drop area_total and list_price, keep every other column in order
data_gbm_clean <- data_gbm_clean[
  ,
  !(names(data_gbm_clean) %in% c("area_total", "list_price"))
]
names(data_gbm_clean)
[1] "mls_number" "property_type" "ac_type" "patio"
[5] "school_general" "photo_count" "pool" "roof_type"
[9] "gas_type" "out_building" "area_living" "land_acres"
[13] "appliances" "garage" "property_condition" "energy_efficient"
[17] "exterior_type" "exterior_features" "fireplace" "foundation_type"
[21] "beds_total" "bath_full" "bath_half" "age"
[25] "dom" "sold_price" "sold_date" "sewer_type"
[29] "property_style" "city_limits" "subdivision" "water_type"
[33] "waterfront" "infections_daily" "infections_accum" "corona_date_split"
[37] "infections_3mma" "top25_sold_price" "top50_sold_price" "bottom25_sold_price"
[41] "top25_area_living" "bottom25_area_living" "top25_age" "bottom25_age"
[45] "top25_dom" "bottom25_dom" "infections_period" "age_2"
[49] "area_living_2"
# Drop the identifier and the indicator/infection columns that are not used
# as predictors in this model; all remaining columns keep their order.
data_gbm_clean <- data_gbm_clean[
  ,
  !(names(data_gbm_clean) %in% c(
    "mls_number", "infections_accum", "corona_date_split",
    "top25_sold_price", "top50_sold_price", "bottom25_sold_price",
    "infections_period", "infections_daily"
  ))
]
names(data_gbm_clean)
[1] "property_type" "ac_type" "patio" "school_general"
[5] "photo_count" "pool" "roof_type" "gas_type"
[9] "out_building" "area_living" "land_acres" "appliances"
[13] "garage" "property_condition" "energy_efficient" "exterior_type"
[17] "exterior_features" "fireplace" "foundation_type" "beds_total"
[21] "bath_full" "bath_half" "age" "dom"
[25] "sold_price" "sold_date" "sewer_type" "property_style"
[29] "city_limits" "subdivision" "water_type" "waterfront"
[33] "infections_3mma" "top25_area_living" "bottom25_area_living" "top25_age"
[37] "bottom25_age" "top25_dom" "bottom25_dom" "age_2"
[41] "area_living_2"
# Partition the data: 70% of the rows for training, the remaining 30% for testing.
# The seed is fixed first so the same partition is drawn on every run.
set.seed(1)
split <- initial_split(data = data_gbm_clean, prop = 0.7)
train <- training(split)
test <- testing(split)
names(train)
[1] "property_type" "ac_type" "patio" "school_general"
[5] "photo_count" "pool" "roof_type" "gas_type"
[9] "out_building" "area_living" "land_acres" "appliances"
[13] "garage" "property_condition" "energy_efficient" "exterior_type"
[17] "exterior_features" "fireplace" "foundation_type" "beds_total"
[21] "bath_full" "bath_half" "age" "dom"
[25] "sold_price" "sold_date" "sewer_type" "property_style"
[29] "city_limits" "subdivision" "water_type" "waterfront"
[33] "infections_3mma" "top25_area_living" "bottom25_area_living" "top25_age"
[37] "bottom25_age" "top25_dom" "bottom25_dom" "age_2"
[41] "area_living_2"
# Predictor names: every training column except the response, sold_price
features <- names(train)[names(train) != "sold_price"]
features
[1] "property_type" "ac_type" "patio" "school_general"
[5] "photo_count" "pool" "roof_type" "gas_type"
[9] "out_building" "area_living" "land_acres" "appliances"
[13] "garage" "property_condition" "energy_efficient" "exterior_type"
[17] "exterior_features" "fireplace" "foundation_type" "beds_total"
[21] "bath_full" "bath_half" "age" "dom"
[25] "sold_date" "sewer_type" "property_style" "city_limits"
[29] "subdivision" "water_type" "waterfront" "infections_3mma"
[33] "top25_area_living" "bottom25_area_living" "top25_age" "bottom25_age"
[37] "top25_dom" "bottom25_dom" "age_2" "area_living_2"
# Build the vtreat treatment plan from the training data only
treatplan <- vtreat::designTreatmentsZ(train, features, verbose = FALSE)

# From the plan's scoreFrame, keep the variables whose code is "clean" or "lev"
new_vars <- with(
  treatplan$scoreFrame,
  varName[code %in% c("clean", "lev")]
)

# Training set: treated predictors as a matrix, response as a plain vector
features_train <- as.matrix(
  vtreat::prepare(treatplan, train, varRestriction = new_vars)
)
response_train <- train$sold_price

# Test set: prepared with the same plan, so its columns match the training matrix
features_test <- as.matrix(
  vtreat::prepare(treatplan, test, varRestriction = new_vars)
)
response_test <- test$sold_price
# dimensions of one-hot encoded data
dim(features_train)
[1] 17088 102
dim(features_test)
[1] 7324 102
# Baseline 5-fold cross-validated xgboost fit with default tuning parameters.
# NOTE(review): a doParallel backend is registered here, but xgb.cv appears to
# do its own multithreading (nthread) rather than use foreach; confirm whether
# this cluster has any effect on the call below.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# reproducibility
set.seed(1)
start_time <- Sys.time()
xgb.fit1 <- xgb.cv(
  data = features_train,
  label = response_train,
  nrounds = 1000,
  nfold = 5,
  # squared-error regression; replaces the deprecated alias "reg:linear"
  objective = "reg:squarederror",
  verbose = 0, # silent
  early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)
[11:27:05] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
[11:27:05] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
[11:27:05] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
[11:27:05] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
[11:27:05] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
end_time <- Sys.time()
time_taken <- end_time - start_time
time_taken
Time difference of 19.11896 secs
# First iteration at which the mean train / mean held-out RMSE is lowest,
# together with the RMSE reached there
dplyr::summarise(
  xgb.fit1$evaluation_log,
  ntrees.train = which.min(train_rmse_mean),
  rmse.train = min(train_rmse_mean),
  ntrees.test = which.min(test_rmse_mean),
  rmse.test = min(test_rmse_mean)
)

# RMSE against boosting iteration: train in red, held-out folds in blue
ggplot(xgb.fit1$evaluation_log, aes(x = iter)) +
  geom_line(aes(y = train_rmse_mean), color = "red") +
  geom_line(aes(y = test_rmse_mean), color = "blue")

stopCluster(cl)
# Full factorial grid of tuning values (4 * 4 * 4 * 3 * 3 combinations).
# optimal_trees and min_RMSE are zero-filled result columns that the grid
# search fills in later.
hyper_grid <- expand.grid(
  eta = c(0.01, 0.05, 0.1, 0.3),
  max_depth = seq(1, 7, by = 2),
  min_child_weight = seq(1, 7, by = 2),
  subsample = c(0.65, 0.8, 1),
  colsample_bytree = c(0.8, 0.9, 1),
  optimal_trees = 0,
  min_RMSE = 0
)
nrow(hyper_grid)
## [1] 576
# Manual grid search: run 5-fold CV for every row of hyper_grid and store, per
# row, the boosting round with the lowest mean held-out RMSE and that RMSE.
# NOTE(review): a doParallel backend is registered here, but xgb.cv appears to
# do its own multithreading (nthread) rather than use foreach; confirm whether
# this cluster has any effect on the loop below.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# seq_len() keeps the loop from running if hyper_grid ever has zero rows
for (i in seq_len(nrow(hyper_grid))) {
  # parameter list for this grid row
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    min_child_weight = hyper_grid$min_child_weight[i],
    subsample = hyper_grid$subsample[i],
    colsample_bytree = hyper_grid$colsample_bytree[i]
  )

  # same seed for every row, so all candidates see the same fold assignment
  set.seed(7)

  # cross-validated fit for this parameter combination
  xgb.tune <- xgb.cv(
    params = params,
    data = features_train,
    label = response_train,
    nrounds = 5000,
    nfold = 5,
    # squared-error regression; replaces the deprecated alias "reg:linear"
    objective = "reg:squarederror",
    verbose = 0, # silent
    early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
  )

  # record the best round and its mean held-out (CV test) RMSE, not training error
  cv_test_rmse <- xgb.tune$evaluation_log$test_rmse_mean
  hyper_grid$optimal_trees[i] <- which.min(cv_test_rmse)
  hyper_grid$min_RMSE[i] <- min(cv_test_rmse)
}

# ten best parameter combinations by cross-validated RMSE
hyper_grid %>%
  dplyr::arrange(min_RMSE) %>%
  head(10)

stopCluster(cl)
# Order: rank, eta, max_depth, min_child_weight, subsample, colsample_bytree,
# optimal_trees, min_RMSE
# 1 0.01 7 1 0.65 0.9 1827 41952.29
# 2 0.01 7 5 0.65 0.9 1747 41967.81
# 3 0.01 7 3 0.65 1.0 1695 42002.20
# 4 0.01 7 1 0.65 1.0 1670 42004.94
# 5 0.01 7 3 0.65 0.8 1583 42008.42
# 6 0.01 7 7 0.65 0.9 1607 42016.38
# 7 0.01 7 5 0.65 1.0 1556 42029.93
# 8 0.01 7 5 0.65 0.8 1479 42038.12
# 9 0.01 7 7 0.65 1.0 1555 42045.10
#10 0.01 7 7 0.65 0.8 1478 42065.54
# Final model, fitted on the full training matrix.
# NOTE(review): a doParallel backend is registered here, but xgboost() appears
# to do its own multithreading (nthread) rather than use foreach; confirm
# whether this cluster has any effect on the fit below.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
start_time <- proc.time()

# Tuning values and nrounds are hard-coded copies of the first row of the grid
# search results listed in the comment table above; update them by hand if the
# search is re-run.
params <- list(
  eta = 0.01,
  max_depth = 7,
  min_child_weight = 1,
  subsample = 0.65,
  colsample_bytree = 0.9
)

# train final model
xgb.fit.final <- xgboost(
  params = params,
  data = features_train,
  label = response_train,
  nrounds = 1827,
  # squared-error regression; replaces the deprecated alias "reg:linear"
  objective = "reg:squarederror",
  verbose = 0
)
[16:36:13] WARNING: amalgamation/../src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
end_time <- proc.time()
time_taken <- end_time - start_time
time_taken
user system elapsed
168.883 1.649 173.325
stopCluster(cl)
# Hold-out performance of the final model:
# predict sold_price for the test matrix ...
pred <- predict(object = xgb.fit.final, newdata = features_test)
# ... and compare the predictions with the observed test prices (RMSE)
caret::RMSE(pred = pred, obs = response_test)
[1] 43140.75
## [1] 21319.4
# Training RMSE of the final model against boosting iteration.
# The original built this identical plot twice in a row; once is enough.
ggplot(xgb.fit.final$evaluation_log) +
  geom_line(aes(iter, train_rmse), color = "red")
NA
NA
# Variable-importance plot via the vip package
vip::vip(xgb.fit.final)

# xgboost's own importance table for the final model ...
importance_matrix <- xgb.importance(model = xgb.fit.final)

# ... plotted for the 20 highest-ranked features, using the "Gain" measure
xgb.plot.importance(
  importance_matrix = importance_matrix,
  top_n = 20,
  measure = "Gain"
)
# Partial-dependence (PDP) and centred ICE plots for infections_3mma.
# NOTE(review): partial() is called without parallel = TRUE, so it is unclear
# whether the backend registered here is actually used; confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# Averaged effect of infections_3mma on the predicted price
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "infections_3mma", n.trees = 1827, grid.resolution = 100, train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")

# One curve per observation (ice = TRUE), centred and drawn semi-transparent
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "infections_3mma", n.trees = 1827, grid.resolution = 100, train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
Warning: `fun.y` is deprecated. Use `fun` instead.
Warning: Ignoring unknown parameters: csides
gridExtra::grid.arrange(pdp, ice, nrow = 2)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
stopCluster(cl)
# Partial-dependence (PDP) and centred ICE plots for area_living.
# NOTE(review): partial() is called without parallel = TRUE, so it is unclear
# whether the backend registered here is actually used; confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# Averaged effect of area_living on the predicted price
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "area_living", n.trees = 1827, grid.resolution = 100, train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")

# One curve per observation (ice = TRUE), centred and drawn semi-transparent
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "area_living", n.trees = 1827, grid.resolution = 100, train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
Warning: `fun.y` is deprecated. Use `fun` instead.
Warning: Ignoring unknown parameters: csides
gridExtra::grid.arrange(pdp, ice, nrow = 2)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
stopCluster(cl)
# NOTE(review): pdp_heat_con is only assigned further down in this file (the
# "Heatmap with Contour" step); run top-to-bottom, this call comes before that
# assignment.
gridExtra::grid.arrange(pdp_heat_con, nrow = 1)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
# NOTE(review): no assignment to pdp_3d is visible in this part of the file;
# confirm it is created elsewhere before this line runs.
pdp_3d
# Partial-dependence (PDP) and centred ICE plots for dom.
# NOTE(review): partial() is called without parallel = TRUE, so it is unclear
# whether the backend registered here is actually used; confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# Averaged effect of dom on the predicted price
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "dom", n.trees = 1827, grid.resolution = 100, train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")

# One curve per observation (ice = TRUE), centred and drawn semi-transparent
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "dom", n.trees = 1827, grid.resolution = 100, train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
Warning: `fun.y` is deprecated. Use `fun` instead.
Warning: Ignoring unknown parameters: csides
gridExtra::grid.arrange(pdp, ice, nrow = 2)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
stopCluster(cl)
# Two-variable partial dependence of the prediction on infections_3mma and dom,
# drawn as a heatmap with contour lines.
# NOTE(review): partial() is called without parallel = TRUE, so it is unclear
# whether the backend registered here is actually used; confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# Heatmap with Contour
pdp_heat_con <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = c("infections_3mma", "dom"), chull = TRUE,
    progress = "text", train = features_train, levelplot = FALSE
  ),
  rug = TRUE,
  contour = TRUE,
  contour.color = "#F0FFF0",
  train = features_train,
  main = "Infections and Days on Market",
  xlab = "Daily Infections (3mma)",
  ylab = "Days on Market",
  legend.title = "Sold Price"
)
|
| | 0%
|
|= | 1%
|
|== | 1%
|
|== | 2%
|
|=== | 2%
|
|=== | 3%
|
|==== | 3%
|
|==== | 4%
|
|===== | 4%
|
|===== | 5%
|
|====== | 5%
|
|====== | 6%
|
|======= | 6%
|
|======= | 7%
|
|======== | 7%
|
|======== | 8%
|
|========= | 8%
|
|========= | 9%
|
|========== | 9%
|
|========== | 10%
|
|=========== | 10%
|
|=========== | 11%
|
|============ | 11%
|
|============ | 12%
|
|============= | 12%
|
|============= | 13%
|
|============== | 13%
|
|============== | 14%
|
|=============== | 14%
|
|=============== | 15%
|
|================ | 15%
|
|================ | 16%
|
|================= | 16%
|
|================== | 17%
|
|=================== | 17%
|
|=================== | 18%
|
|==================== | 18%
|
|==================== | 19%
|
|===================== | 19%
|
|===================== | 20%
|
|====================== | 20%
|
|====================== | 21%
|
|======================= | 21%
|
|======================= | 22%
|
|======================== | 22%
|
|======================== | 23%
|
|========================= | 23%
|
|========================= | 24%
|
|========================== | 24%
|
|========================== | 25%
|
|=========================== | 25%
|
|=========================== | 26%
|
|============================ | 26%
|
|============================ | 27%
|
|============================= | 27%
|
|============================= | 28%
|
|============================== | 28%
|
|============================== | 29%
|
|=============================== | 29%
|
|=============================== | 30%
|
|================================ | 30%
|
|================================ | 31%
|
|================================= | 31%
|
|================================= | 32%
|
|================================== | 32%
|
|================================== | 33%
|
|=================================== | 33%
|
|==================================== | 34%
|
|===================================== | 34%
|
|===================================== | 35%
|
|====================================== | 35%
|
|====================================== | 36%
|
|======================================= | 36%
|
|======================================= | 37%
|
|======================================== | 37%
|
|======================================== | 38%
|
|========================================= | 38%
|
|========================================= | 39%
|
|========================================== | 39%
|
|========================================== | 40%
|
|=========================================== | 40%
|
|=========================================== | 41%
|
|============================================ | 41%
|
|============================================ | 42%
|
|============================================= | 42%
|
|============================================= | 43%
|
|============================================== | 43%
|
|============================================== | 44%
|
|=============================================== | 44%
|
|=============================================== | 45%
|
|================================================ | 45%
|
|================================================ | 46%
|
|================================================= | 46%
|
|================================================= | 47%
|
|================================================== | 47%
|
|================================================== | 48%
|
|=================================================== | 48%
|
|=================================================== | 49%
|
|==================================================== | 49%
|
|===================================================== | 50%
|
|====================================================== | 51%
|
|======================================================= | 51%
|
|======================================================= | 52%
|
|======================================================== | 52%
|
|======================================================== | 53%
|
|========================================================= | 53%
|
|========================================================= | 54%
|
|========================================================== | 54%
|
|========================================================== | 55%
|
|=========================================================== | 55%
|
|=========================================================== | 56%
|
|============================================================ | 56%
|
|============================================================ | 57%
|
|============================================================= | 57%
|
|============================================================= | 58%
|
|============================================================== | 58%
|
|============================================================== | 59%
|
|=============================================================== | 59%
|
|=============================================================== | 60%
|
|================================================================ | 60%
|
|================================================================ | 61%
|
|================================================================= | 61%
|
|================================================================= | 62%
|
|================================================================== | 62%
|
|================================================================== | 63%
|
|=================================================================== | 63%
|
|=================================================================== | 64%
|
|==================================================================== | 64%
|
|==================================================================== | 65%
|
|===================================================================== | 65%
|
|===================================================================== | 66%
|
|====================================================================== | 66%
|
|======================================================================= | 67%
|
|======================================================================== | 67%
|
|======================================================================== | 68%
|
|========================================================================= | 68%
|
|========================================================================= | 69%
|
|========================================================================== | 69%
|
|========================================================================== | 70%
|
|=========================================================================== | 70%
|
|=========================================================================== | 71%
|
|============================================================================ | 71%
|
|============================================================================ | 72%
|
|============================================================================= | 72%
|
|============================================================================= | 73%
|
|============================================================================== | 73%
|
|============================================================================== | 74%
|
|=============================================================================== | 74%
|
|=============================================================================== | 75%
|
|================================================================================ | 75%
|
|================================================================================ | 76%
|
|================================================================================= | 76%
|
|================================================================================= | 77%
|
|================================================================================== | 77%
|
|================================================================================== | 78%
|
|=================================================================================== | 78%
|
|=================================================================================== | 79%
|
|==================================================================================== | 79%
|
|==================================================================================== | 80%
|
|===================================================================================== | 80%
|
|===================================================================================== | 81%
|
|====================================================================================== | 81%
|
|====================================================================================== | 82%
|
|======================================================================================= | 82%
|
|======================================================================================= | 83%
|
|======================================================================================== | 83%
|
|========================================================================================= | 84%
|
|========================================================================================== | 84%
|
|========================================================================================== | 85%
|
|=========================================================================================== | 85%
|
|=========================================================================================== | 86%
|
|============================================================================================ | 86%
|
|============================================================================================ | 87%
|
|============================================================================================= | 87%
|
|============================================================================================= | 88%
|
|============================================================================================== | 88%
|
|============================================================================================== | 89%
|
|=============================================================================================== | 89%
|
|=============================================================================================== | 90%
|
|================================================================================================ | 90%
|
|================================================================================================ | 91%
|
|================================================================================================= | 91%
|
|================================================================================================= | 92%
|
|================================================================================================== | 92%
|
|================================================================================================== | 93%
|
|=================================================================================================== | 93%
|
|=================================================================================================== | 94%
|
|==================================================================================================== | 94%
|
|==================================================================================================== | 95%
|
|===================================================================================================== | 95%
|
|===================================================================================================== | 96%
|
|====================================================================================================== | 96%
|
|====================================================================================================== | 97%
|
|======================================================================================================= | 97%
|
|======================================================================================================= | 98%
|
|======================================================================================================== | 98%
|
|======================================================================================================== | 99%
|
|========================================================================================================= | 99%
|
|==========================================================================================================| 100%
# 3D Graphing
# Create 3D data matrix: pull the two predictor columns and the predicted
# value out of the `data` element of `pdp_heat_con`.
infections_3d <- pdp_heat_con$data$infections_3mma
dom_3d <- pdp_heat_con$data$dom
yhat_3d <- pdp_heat_con$data$yhat
pdp_mat <- data.frame(infections_3d, dom_3d, yhat_3d) # Data frame for plotly 3D model
# Axis Titles
axx <- list(title = "Infections Daily")
axy <- list(title = "Days on Market")
axz <- list(title = "Price Sold")
# Colors: Manually matching plot standard gradient
very_low <- "#460f5c"
low <- "#2c728e"
med <- "#27ad81"
high <- "#f4e61e"
# All four colour stops feed the ramp. `low` used to be defined but left out,
# which skipped the blue segment of the gradient the heatmap uses.
pdp_3d <- plot_ly(
  pdp_mat,
  x = ~infections_3d, y = ~dom_3d, z = ~yhat_3d,
  type = "mesh3d", intensity = ~yhat_3d,
  colors = colorRamp(c(very_low, low, med, high))
)
pdp_3d <- pdp_3d %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz)) # Axis labs
pdp_3d <- hide_colorbar(pdp_3d) # Hide legend
# Print out
gridExtra::grid.arrange(pdp_heat_con, nrow = 1)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
# Display the 3D plotly figure stored in `pdp_3d`
pdp_3d
# Shut down the worker cluster held in `cl`
stopCluster(cl)
# Start a 5-worker PSOCK cluster and register it as the foreach backend.
# NOTE(review): partial() is called below without `parallel = TRUE`, so the
# registered backend is presumably not used by these calls -- confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Partial dependence plot for `age`, with a rug of the training values and
# the y axis formatted as dollars.
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "age", n.trees = 1827, grid.resolution = 100,
    train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")
# Centered individual conditional expectation (ICE) curves for `age`,
# drawn semi-transparent (alpha = .1).
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "age", n.trees = 1827, grid.resolution = 100,
    train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
Warning: `fun.y` is deprecated. Use `fun` instead.
Warning: Ignoring unknown parameters: csides
# Draw the PDP and ICE plots stacked in two rows
gridExtra::grid.arrange(pdp, ice, nrow = 2)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
# Shut down the worker cluster held in `cl`
stopCluster(cl)
# Start a 5-worker PSOCK cluster and register it as the foreach backend.
# NOTE(review): partial() is called below without `parallel = TRUE`, so the
# registered backend is presumably not used by this call -- confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Heatmap with Contour: two-way partial dependence of the prediction on
# `infections_3mma` and `age`, restricted to the convex hull of the training
# values (chull = TRUE), with a text progress bar.
pdp_heat_con <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = c("infections_3mma", "age"),
    chull = TRUE,
    progress = "text",
    train = features_train,
    levelplot = FALSE
  ),
  rug = TRUE,
  contour = TRUE,
  contour.color = "#F0FFF0",
  train = features_train,
  main = "Infections and Age of Property",
  xlab = "Daily Infections (3mma)",
  ylab = "Age of Property",
  legend.title = "Sold Price"
)
|
| | 0%
|
|= | 0%
|
|= | 1%
|
|== | 1%
|
|== | 2%
|
|=== | 2%
|
|=== | 3%
|
|==== | 3%
|
|==== | 4%
|
|===== | 4%
|
|===== | 5%
|
|====== | 5%
|
|====== | 6%
|
|======= | 6%
|
|======= | 7%
|
|======== | 7%
|
|======== | 8%
|
|========= | 8%
|
|========= | 9%
|
|========== | 9%
|
|========== | 10%
|
|=========== | 10%
|
|=========== | 11%
|
|============ | 11%
|
|============ | 12%
|
|============= | 12%
|
|============= | 13%
|
|============== | 13%
|
|============== | 14%
|
|=============== | 14%
|
|=============== | 15%
|
|================ | 15%
|
|================ | 16%
Warning in .Internal(eval(expr, envir, enclos)) :
closing unused connection 12 (<-localhost:11516)
Warning in .Internal(eval(expr, envir, enclos)) :
closing unused connection 11 (<-localhost:11516)
Warning in .Internal(eval(expr, envir, enclos)) :
closing unused connection 10 (<-localhost:11516)
Warning in .Internal(eval(expr, envir, enclos)) :
closing unused connection 9 (<-localhost:11516)
Warning in .Internal(eval(expr, envir, enclos)) :
closing unused connection 8 (<-localhost:11516)
|
|================= | 16%
|
|================= | 17%
|
|================== | 17%
|
|================== | 18%
|
|=================== | 18%
|
|=================== | 19%
|
|==================== | 19%
|
|==================== | 20%
|
|===================== | 20%
|
|===================== | 21%
|
|====================== | 21%
|
|====================== | 22%
|
|======================= | 22%
|
|======================= | 23%
|
|======================== | 23%
|
|======================== | 24%
|
|========================= | 24%
|
|========================== | 25%
|
|=========================== | 26%
|
|============================ | 26%
|
|============================ | 27%
|
|============================= | 27%
|
|============================= | 28%
|
|============================== | 28%
|
|============================== | 29%
|
|=============================== | 29%
|
|=============================== | 30%
|
|================================ | 30%
|
|================================ | 31%
|
|================================= | 31%
|
|================================= | 32%
|
|================================== | 32%
|
|================================== | 33%
|
|=================================== | 33%
|
|=================================== | 34%
|
|==================================== | 34%
|
|==================================== | 35%
|
|===================================== | 35%
|
|===================================== | 36%
|
|====================================== | 36%
|
|====================================== | 37%
|
|======================================= | 37%
|
|======================================= | 38%
|
|======================================== | 38%
|
|======================================== | 39%
|
|========================================= | 39%
|
|========================================= | 40%
|
|========================================== | 40%
|
|========================================== | 41%
|
|=========================================== | 41%
|
|=========================================== | 42%
|
|============================================ | 42%
|
|============================================ | 43%
|
|============================================= | 43%
|
|============================================= | 44%
|
|============================================== | 44%
|
|============================================== | 45%
|
|=============================================== | 45%
|
|=============================================== | 46%
|
|================================================ | 46%
|
|================================================ | 47%
|
|================================================= | 47%
|
|================================================= | 48%
|
|================================================== | 48%
|
|================================================== | 49%
|
|=================================================== | 49%
|
|=================================================== | 50%
|
|==================================================== | 50%
|
|===================================================== | 50%
|
|===================================================== | 51%
|
|====================================================== | 51%
|
|====================================================== | 52%
|
|======================================================= | 52%
|
|======================================================= | 53%
|
|======================================================== | 53%
|
|======================================================== | 54%
|
|========================================================= | 54%
|
|========================================================= | 55%
|
|========================================================== | 55%
|
|========================================================== | 56%
|
|=========================================================== | 56%
|
|=========================================================== | 57%
|
|============================================================ | 57%
|
|============================================================ | 58%
|
|============================================================= | 58%
|
|============================================================= | 59%
|
|============================================================== | 59%
|
|============================================================== | 60%
|
|=============================================================== | 60%
|
|=============================================================== | 61%
|
|================================================================ | 61%
|
|================================================================ | 62%
|
|================================================================= | 62%
|
|================================================================= | 63%
|
|================================================================== | 63%
|
|================================================================== | 64%
|
|=================================================================== | 64%
|
|=================================================================== | 65%
|
|==================================================================== | 65%
|
|==================================================================== | 66%
|
|===================================================================== | 66%
|
|===================================================================== | 67%
|
|====================================================================== | 67%
|
|====================================================================== | 68%
|
|======================================================================= | 68%
|
|======================================================================= | 69%
|
|======================================================================== | 69%
|
|======================================================================== | 70%
|
|========================================================================= | 70%
|
|========================================================================= | 71%
|
|========================================================================== | 71%
|
|========================================================================== | 72%
|
|=========================================================================== | 72%
|
|=========================================================================== | 73%
|
|============================================================================ | 73%
|
|============================================================================ | 74%
|
|============================================================================= | 74%
|
|============================================================================== | 75%
|
|=============================================================================== | 76%
|
|================================================================================ | 76%
|
|================================================================================ | 77%
|
|================================================================================= | 77%
|
|================================================================================= | 78%
|
|================================================================================== | 78%
|
|================================================================================== | 79%
|
|=================================================================================== | 79%
|
|=================================================================================== | 80%
|
|==================================================================================== | 80%
|
|==================================================================================== | 81%
|
|===================================================================================== | 81%
|
|===================================================================================== | 82%
|
|====================================================================================== | 82%
|
|====================================================================================== | 83%
|
|======================================================================================= | 83%
|
|======================================================================================= | 84%
|
|======================================================================================== | 84%
|
|======================================================================================== | 85%
|
|========================================================================================= | 85%
|
|========================================================================================= | 86%
|
|========================================================================================== | 86%
|
|========================================================================================== | 87%
|
|=========================================================================================== | 87%
|
|=========================================================================================== | 88%
|
|============================================================================================ | 88%
|
|============================================================================================ | 89%
|
|============================================================================================= | 89%
|
|============================================================================================= | 90%
|
|============================================================================================== | 90%
|
|============================================================================================== | 91%
|
|=============================================================================================== | 91%
|
|=============================================================================================== | 92%
|
|================================================================================================ | 92%
|
|================================================================================================ | 93%
|
|================================================================================================= | 93%
|
|================================================================================================= | 94%
|
|================================================================================================== | 94%
|
|================================================================================================== | 95%
|
|=================================================================================================== | 95%
|
|=================================================================================================== | 96%
|
|==================================================================================================== | 96%
|
|==================================================================================================== | 97%
|
|===================================================================================================== | 97%
|
|===================================================================================================== | 98%
|
|====================================================================================================== | 98%
|
|====================================================================================================== | 99%
|
|======================================================================================================= | 99%
|
|======================================================================================================= | 100%
|
|========================================================================================================| 100%
# 3D Graphing
# Create 3D data matrix: pull the two predictor columns and the predicted
# value out of the `data` element of `pdp_heat_con`.
infections_3d <- pdp_heat_con$data$infections_3mma
age_3d <- pdp_heat_con$data$age
yhat_3d <- pdp_heat_con$data$yhat
pdp_mat <- data.frame(infections_3d, age_3d, yhat_3d) # Data frame for plotly 3D model
# Axis Titles
axx <- list(title = "Infections Daily")
axy <- list(title = "Age of Property")
axz <- list(title = "Price Sold")
# Colors: Manually matching plot standard gradient
very_low <- "#460f5c"
low <- "#2c728e"
med <- "#27ad81"
high <- "#f4e61e"
# All four colour stops feed the ramp. `low` used to be defined but left out,
# which skipped the blue segment of the gradient the heatmap uses.
pdp_3d <- plot_ly(
  pdp_mat,
  x = ~infections_3d, y = ~age_3d, z = ~yhat_3d,
  type = "mesh3d", intensity = ~yhat_3d,
  colors = colorRamp(c(very_low, low, med, high))
)
pdp_3d <- pdp_3d %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz)) # Axis labs
pdp_3d <- hide_colorbar(pdp_3d) # Hide legend
# Print out
gridExtra::grid.arrange(pdp_heat_con, nrow = 1)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
# Display the 3D plotly figure stored in `pdp_3d`
pdp_3d
# Shut down the worker cluster held in `cl`
stopCluster(cl)
# Start a 5-worker PSOCK cluster and register it as the foreach backend.
# NOTE(review): partial() is called below without `parallel = TRUE`, so the
# registered backend is presumably not used by these calls -- confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Partial dependence plot for `sold_date`, with a rug of the training values
# and the y axis formatted as dollars.
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "sold_date", n.trees = 1827, grid.resolution = 100,
    train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")
# Centered individual conditional expectation (ICE) curves for `sold_date`,
# drawn semi-transparent (alpha = .1).
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "sold_date", n.trees = 1827, grid.resolution = 100,
    train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
# Draw both plots stacked in two rows, then release the workers.
gridExtra::grid.arrange(pdp, ice, nrow = 2)
stopCluster(cl)
# Draw the heatmap stored in `pdp_heat_con` on a single-row layout.
# NOTE(review): the same call already appears after the preceding 3D graphing
# section; this looks like a leftover duplicate -- confirm before removing.
gridExtra::grid.arrange(pdp_heat_con, nrow = 1)
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
Warning: Use of `object[[2L]]` is discouraged. Use `.data[[2L]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
# Display the 3D plotly figure stored in `pdp_3d`.
# NOTE(review): `pdp_3d` is already printed after the preceding 3D graphing
# section; this looks like a leftover duplicate -- confirm before removing.
pdp_3d
# Start a 5-worker PSOCK cluster and register it as the foreach backend.
# NOTE(review): partial() is called below without `parallel = TRUE`, so the
# registered backend is presumably not used by these calls -- confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Partial dependence plot for `photo_count`, with a rug of the training
# values and the y axis formatted as dollars.
pdp <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "photo_count", n.trees = 1827, grid.resolution = 100,
    train = features_train
  ),
  rug = TRUE, train = features_train
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("PDP")
# Centered individual conditional expectation (ICE) curves for `photo_count`,
# drawn semi-transparent (alpha = .1).
ice <- autoplot(
  partial(
    xgb.fit.final,
    pred.var = "photo_count", n.trees = 1827, grid.resolution = 100,
    train = features_train, ice = TRUE
  ),
  rug = TRUE, train = features_train, alpha = .1, center = TRUE
) +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("ICE")
# Draw both plots stacked in two rows, then release the workers.
gridExtra::grid.arrange(pdp, ice, nrow = 2)
stopCluster(cl)
LIME
# Run the local observations through the vtreat treatment plan, keeping only
# the variables listed in `new_vars`, so they are encoded like the model input.
local_obs_onehot <- vtreat::prepare(
  treatplan,
  local_obs,
  varRestriction = new_vars
)
# Build the LIME explainer from the training features and the fitted model.
explainer <- lime(x = data.frame(features_train), model = xgb.fit.final)
# Explain each local observation with its 5 most influential features.
explanation <- explain(
  x = local_obs_onehot,
  explainer = explainer,
  n_features = 5
)
# Plot the per-observation feature contributions.
plot_features(explanation)
Predicting impact of higher cases of corona
# predict values for the first two test observations (spot check)
feature_test <- features_test[1:2, ]
pred <- predict(xgb.fit.final, feature_test)
pred
# results
# RMSE needs predictions and observations of equal length. Score the full
# test set against `response_test` rather than recycling the two spot-check
# predictions in `pred` across every test response.
pred_test <- predict(xgb.fit.final, features_test)
caret::RMSE(pred_test, response_test)
## [1] 21319.3
# Start a 5-worker PSOCK cluster and register it as the foreach backend.
# NOTE(review): xgboost() presumably parallelises through its own `nthread`
# setting rather than foreach, so this cluster may be idle -- confirm.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# reproducibility
set.seed(2)
start_time <- proc.time()
# Fit the final boosted model on the training matrix.
# `nfold` was dropped: it is an xgb.cv() argument and has no effect on a
# single xgboost() fit.
xgb.fit.final <- xgboost(
  data = features_train,
  label = response_train,
  nrounds = 950,
  objective = "reg:squarederror", # regression; replaces deprecated alias "reg:linear"
  verbose = 0, # silent
  # stop if no improvement for 10 consecutive trees
  # NOTE(review): no separate validation set is supplied here, so the
  # improvement is presumably measured on the training data -- confirm.
  early_stopping_rounds = 10
)
end_time <- proc.time()
# Elapsed user/system/wall-clock time for the fit
time_taken <- end_time - start_time
time_taken
stopCluster(cl)
End of Code