Libraries

library(tidyverse)
library(xgboost)
library(data.table)
library(randomForest)
library(caret)
housing <- fread("housing.csv", data.table = FALSE)

# impute missing total_bedrooms with the column median
housing <- housing %>% 
  mutate(total_bedrooms = ifelse(is.na(total_bedrooms), 
                                 median(total_bedrooms, na.rm = TRUE), total_bedrooms))
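# quick sanity check (optional): the median imputation above should leave
# no missing values in total_bedrooms
sum(is.na(housing$total_bedrooms))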

# per-household averages, then drop the raw totals
housing$mean_bedrooms = housing$total_bedrooms/housing$households
housing$mean_rooms = housing$total_rooms/housing$households

housing <- housing %>% 
  select(-c(total_rooms, total_bedrooms))
# One-hot encoding of the ocean_proximity factor
categories = unique(housing$ocean_proximity)
cat_housing = data.frame(ocean_proximity = housing$ocean_proximity)

# start every indicator column at 0...
for(cat in categories){
    cat_housing[,cat] = rep(0, times = nrow(cat_housing))
}

# ...then set the matching category to 1 for each row
for(i in 1:length(cat_housing$ocean_proximity)){
    cat = as.character(cat_housing$ocean_proximity[i])
    cat_housing[,cat][i] = 1
}
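As an aside, the same one-hot encoding can be done in a single step with base R's model.matrix; this is just an equivalent sketch (note that model.matrix prefixes each column name with the variable name, unlike the loop above):

# equivalent one-hot encoding in one call; "+ 0" drops the intercept so every
# ocean_proximity category gets its own 0/1 indicator column
cat_housing_alt = as.data.frame(model.matrix(~ ocean_proximity + 0, data = housing))
head(cat_housing_alt)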

Standardization of numeric variables

cat_housing <- cat_housing %>% select(-ocean_proximity)

drops = c('ocean_proximity','median_house_value')
housing_num =  housing[ , !(names(housing) %in% drops)]

scaled_housing_num = scale(housing_num)
cleaned_housing = cbind(cat_housing, scaled_housing_num, median_house_value=housing$median_house_value)
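# quick check (optional): after scale(), every numeric column should have
# mean ~0 and standard deviation ~1
round(colMeans(scaled_housing_num), 3)
round(apply(scaled_housing_num, 2, sd), 3)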

Train-test split

set.seed(19) # set a random seed so that the same sample can be reproduced in future runs

sample = sample.int(n = nrow(cleaned_housing), size = floor(.8*nrow(cleaned_housing)), replace = F)
train = cleaned_housing[sample, ] #just the samples
test  = cleaned_housing[-sample, ] #everything but the samples


train_y = train[,'median_house_value']
train_x = train[, names(train) !='median_house_value']

test_y = test[,'median_house_value']
test_x = test[, names(test) !='median_house_value']

head(train)
##       NEAR BAY <1H OCEAN INLAND NEAR OCEAN ISLAND  longitude   latitude
## 7677         0         1      0          0      0  0.7385481 -0.8014511
## 9910         1         0      0          0      0 -1.3627414  1.2538381
## 4483         0         1      0          0      0  0.6886362 -0.7359066
## 14674        0         0      0          1      0  1.2276843 -1.3164439
## 19129        0         1      0          0      0 -1.5524064  1.2257476
## 4803         0         1      0          0      0  0.6037860 -0.7499518
##       housing_median_age  population  households median_income mean_bedrooms
## 7677          0.58483810 -0.63090564 -0.73376366    0.36804980   -0.26473554
## 9910          0.42592579 -0.23265833 -0.24204153   -0.21879474   -0.25116427
## 4483          0.26701348 -0.59823347 -0.88023409   -0.99718357   -0.24164088
## 14674        -0.92482882 -0.07459565 -0.48528706   -0.52408655   -0.09622247
## 19129         0.02864502 -0.68477058 -0.71022377   -0.02303953   -0.09299386
## 4803          0.50538194  0.21238967 -0.04587579   -1.17141044   -0.12643615
##        mean_rooms median_house_value
## 7677   0.06837588             170800
## 9910  -0.40973944             137500
## 4483  -0.81808817             137500
## 14674  0.10592753             112500
## 19129  0.13683122             306000
## 4803  -0.66896726             125900

Random Forest

rf_model = randomForest(train_x, y = train_y , ntree = 500, importance = TRUE)
names(rf_model) #these are all the different things you can call from the model.
##  [1] "call"            "type"            "predicted"       "mse"            
##  [5] "rsq"             "oob.times"       "importance"      "importanceSD"   
##  [9] "localImportance" "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"               "test"           
## [17] "inbag"
importance_dat = rf_model$importance
importance_dat
##                         %IncMSE IncNodePurity
## NEAR BAY            443660928.4  1.461671e+12
## <1H OCEAN          1546015541.3  4.616786e+12
## INLAND             3895405979.2  3.107474e+13
## NEAR OCEAN          456668450.0  2.109282e+12
## ISLAND                 597689.1  5.115434e+10
## longitude          6662896402.7  2.531535e+13
## latitude           5355943908.8  2.198212e+13
## housing_median_age 1032310411.4  9.719367e+12
## population         1072996419.2  7.567615e+12
## households         1157015112.3  8.025847e+12
## median_income      8358786022.8  7.218902e+13
## mean_bedrooms       415700691.6  7.498511e+12
## mean_rooms         1776288466.1  2.118150e+13
sorted_predictors = sort(importance_dat[,1], decreasing=TRUE)
sorted_predictors
##      median_income          longitude           latitude             INLAND 
##       8358786022.8       6662896402.7       5355943908.8       3895405979.2 
##         mean_rooms          <1H OCEAN         households         population 
##       1776288466.1       1546015541.3       1157015112.3       1072996419.2 
## housing_median_age         NEAR OCEAN           NEAR BAY      mean_bedrooms 
##       1032310411.4        456668450.0        443660928.4        415700691.6 
##             ISLAND 
##           597689.1
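# optional sketch: randomForest also ships a built-in plot of these importance
# measures, often easier to read than the raw table above
varImpPlot(rf_model)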
oob_prediction = predict(rf_model) #with no newdata supplied, predict() returns the out-of-bag (OOB) predictions

#you may have noticed that this is also available via $mse on the model object,
#but this way we learn stuff!
train_mse = mean(as.numeric((oob_prediction - train_y)^2))
oob_rmse = sqrt(train_mse)
oob_rmse
## [1] 49203.08
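# as the comment above notes, the OOB error is also stored on the model:
# rf_model$mse holds the OOB MSE after each tree, so the last entry should
# (approximately) reproduce the oob_rmse computed by hand
sqrt(tail(rf_model$mse, 1))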
y_pred_rf = predict(rf_model , test_x)
test_mse = mean(((y_pred_rf - test_y)^2))
test_rmse = sqrt(test_mse)
test_rmse # ~47800 on this split
## [1] 47806.95
rf_benchmark  <- 47806

XGBoost

A more precise name would be regularized gradient boosting; the "eXtreme" refers to its computational efficiency.

XGBoost is also a forest of trees, but unlike in a random forest those trees are built additively. The algorithm grows trees iteratively so as to minimize the error, ending up with an optimal set of predictive trees. The trees grow sequentially: each new tree is grown using information from the previously built trees, fitting a modified version of the original dataset based on them. In XGBoost the trees also come with regularization parameters that help prevent overfitting.
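As a minimal sketch of where that regularization enters (illustrative only, fit on the small built-in mtcars dataset rather than the housing data, so it is self-contained), the penalty terms are passed as parameters to xgb.train:

# illustrative only: lambda (L2), alpha (L1) and gamma (minimum loss reduction
# needed to make a split) are the regularization knobs discussed above
demo_x = as.matrix(mtcars[, c("wt", "hp", "disp")])
demo_d = xgb.DMatrix(data = demo_x, label = mtcars$mpg)

bst_demo = xgb.train(data = demo_d,
                     max.depth = 3,
                     eta = 0.1,
                     lambda = 1,   # L2 penalty on leaf weights
                     alpha = 0,    # L1 penalty on leaf weights
                     gamma = 1,    # minimum loss reduction required to split
                     nround = 50,
                     objective = "reg:linear")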

#put into the xgb matrix format
dtrain = xgb.DMatrix(data =  as.matrix(train_x), label = train_y )
dtest = xgb.DMatrix(data =  as.matrix(test_x), label = test_y)

# these are the datasets the rmse is evaluated for at each iteration
watchlist = list(train=dtrain, test=dtest)


# try 1 - a set of parameters I know works pretty well for most problems

bst = xgb.train(data = dtrain, 
                max.depth = 8, # maximum tree depth
                eta = 0.3, # learning rate
                nthread = 2, 
                nround = 1000,  # number of trees
                watchlist = watchlist, 
                objective = "reg:linear", 
                early_stopping_rounds = 50, # stop if no improvement in 50 rounds
                print_every_n = 500) 
## [1]  train-rmse:171584.296875    test-rmse:171376.609375 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## Stopping. Best iteration:
## [223]    train-rmse:8550.336914  test-rmse:46795.085938
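The stopping point and its score can also be read back from the fitted booster, via the same fields the grid search below relies on:

bst$best_iteration # iteration with the lowest test RMSE
bst$best_score     # the test RMSE at that iteration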

Hyperparameter optimization

We set up the validation set.

####
# Proper use - validation set
####


sample = sample.int(n = nrow(train), size = floor(.8*nrow(train)), replace = F)

train_t = train[sample, ] #just the samples
valid  = train[-sample, ] #everything but the samples

train_y = train_t[,'median_house_value']

#if using tidyverse, dplyr's pull() extracts the column as a plain vector:
#train_y = pull(train_t, median_house_value)
train_x = train_t[, names(train_t) !='median_house_value']

valid_y = valid[,'median_house_value']
valid_x = valid[, names(train_t) !='median_house_value']

train_y[1:10]
##  [1] 302000 340800 128500 113800 146900 177500 215400  73400 240500 500001
gb_train = xgb.DMatrix(data = as.matrix(train_x), label = train_y )
gb_valid = xgb.DMatrix(data = as.matrix(valid_x), label = valid_y )
#in jupyter the label needs to be in an as.matrix() or I get an error? subtle and annoying differences

# train xgb, evaluating against the validation
watchlist = list(train = gb_train, valid = gb_valid)
bst_slow = xgb.train(data= gb_train, 
                        max.depth = 10, 
                        eta = 0.01, 
                        nthread = 2, 
                        nround = 1000, 
                        watchlist = watchlist, 
                        objective = "reg:linear", 
                        early_stopping_rounds = 50,
                        print_every_n = 500)
## [1]  train-rmse:234876.875000    valid-rmse:234400.703125 
## Multiple eval metrics are present. Will use valid_rmse for early stopping.
## Will train until valid_rmse hasn't improved in 50 rounds.
## 
## [501]    train-rmse:21971.261719 valid-rmse:48034.664062 
## [1000]   train-rmse:14432.571289 valid-rmse:47050.019531
# recall we ran the following to get the test data in the right format:
# dtest = xgb.DMatrix(data =  as.matrix(test_x), label = test_y)
# here it is with the label removed, just to remind us it is external data;
# xgb would ignore the label during prediction anyway
dtest = xgb.DMatrix(data =  as.matrix(test_x))

#test the model on truly external data

y_hat_valid = predict(bst_slow, dtest)

test_mse = mean(((y_hat_valid - test_y)^2))
test_rmse = sqrt(test_mse)
test_rmse
## [1] 45921.76

Grid Search

# hyperparameters to search over
max.depths = c(7, 9)
etas = c(0.01, 0.001)

best_params = 0 # placeholders, filled on the first iteration
best_score = 0

count = 1
for( depth in max.depths ){
    for( num in etas){

        bst_grid = xgb.train(data = gb_train, 
                                max.depth = depth, 
                                eta=num, 
                                nthread = 2, 
                                nround = 1000, 
                                watchlist = watchlist, 
                                objective = "reg:linear", 
                                early_stopping_rounds = 50, 
                                verbose=0)

        if(count == 1){
            best_params = bst_grid$params
            best_score = bst_grid$best_score
            count = count + 1
            }
        else if( bst_grid$best_score < best_score){
            best_params = bst_grid$params
            best_score = bst_grid$best_score
        }
    }
}

best_params
## $max_depth
## [1] 9
## 
## $eta
## [1] 0.01
## 
## $nthread
## [1] 2
## 
## $objective
## [1] "reg:linear"
## 
## $silent
## [1] 1
best_score
## valid-rmse 
##   47111.51
# best_params above gave max_depth = 9 and eta = 0.01; note the model below uses max.depth = 7
bst_tuned = xgb.train( data = gb_train, 
                        max.depth = 7, 
                        eta = 0.01, 
                        nthread = 2, 
                        nround = 1000, 
                        watchlist = watchlist, 
                        objective = "reg:linear", 
                        early_stopping_rounds = 50,
                        print_every_n = 500)
## [1]  train-rmse:234899.500000    valid-rmse:234407.437500 
## Multiple eval metrics are present. Will use valid_rmse for early stopping.
## Will train until valid_rmse hasn't improved in 50 rounds.
## 
## [501]    train-rmse:38416.281250 valid-rmse:50135.722656 
## [1000]   train-rmse:30785.687500 valid-rmse:47815.984375
y_hat_xgb_grid = predict(bst_tuned, dtest)

test_mse = mean(((y_hat_xgb_grid - test_y)^2))
test_rmse = sqrt(test_mse)
test_rmse # test-rmse: 46675
## [1] 46544.54
test_rmse/rf_benchmark
## [1] 0.973613

Optimization with caret

# look up the model we are running to see its parameters
modelLookup("xgbLinear")
##       model parameter                 label forReg forClass probModel
## 1 xgbLinear   nrounds # Boosting Iterations   TRUE     TRUE      TRUE
## 2 xgbLinear    lambda     L2 Regularization   TRUE     TRUE      TRUE
## 3 xgbLinear     alpha     L1 Regularization   TRUE     TRUE      TRUE
## 4 xgbLinear       eta         Learning Rate   TRUE     TRUE      TRUE
# set up all the pairwise combinations

xgb_grid_1 = expand.grid(nrounds = c(1000,2000,3000,4000) ,
                            eta = c(0.01, 0.001, 0.0001),
                            lambda = 1,
                            alpha = 0)
xgb_grid_1
##    nrounds   eta lambda alpha
## 1     1000 1e-02      1     0
## 2     2000 1e-02      1     0
## 3     3000 1e-02      1     0
## 4     4000 1e-02      1     0
## 5     1000 1e-03      1     0
## 6     2000 1e-03      1     0
## 7     3000 1e-03      1     0
## 8     4000 1e-03      1     0
## 9     1000 1e-04      1     0
## 10    2000 1e-04      1     0
## 11    3000 1e-04      1     0
## 12    4000 1e-04      1     0
#here we do one better than a single validation set: we use cross-validation
#to get more out of the training data!
xgb_trcontrol_1 = trainControl(method = "cv",
                                number = 5,
                                verboseIter = TRUE,
                                returnData = FALSE,
                                returnResamp = "all", 
                                allowParallel = TRUE)
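# for intuition (optional sketch): the 5 folds trainControl will use can be
# built directly with caret::createFolds and inspected
set.seed(19)
cv_folds = createFolds(train_y, k = 5)
sapply(cv_folds, length) # roughly equal fold sizes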
######
#below a grid-search, cross-validation xgboost model in caret
######


xgb_train_1 = train(x = as.matrix(train_x),
                    y = train_y,
                    trControl = xgb_trcontrol_1,
                    tuneGrid = xgb_grid_1,
                    method = "xgbLinear",
                    max.depth = 5)
## + Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 1000, lambda = 1, alpha = 0, eta = 1e-04 on full training set
names(xgb_train_1)
##  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
##  [6] "bestTune"     "call"         "dots"         "metric"       "control"     
## [11] "finalModel"   "preProcess"   "trainingData" "resample"     "resampledCM" 
## [16] "perfNames"    "maximize"     "yLimits"      "times"        "levels"
xgb_train_1$bestTune
##   nrounds lambda alpha   eta
## 1    1000      1     0 1e-04
xgb_train_1$method
## [1] "xgbLinear"
summary(xgb_train_1)
##               Length  Class              Mode       
## handle              1 xgb.Booster.handle externalptr
## raw           2039214 -none-             raw        
## niter               1 -none-             numeric    
## call                6 -none-             call       
## params              5 -none-             list       
## callbacks           1 -none-             list       
## feature_names      13 -none-             character  
## nfeatures           1 -none-             numeric    
## xNames             13 -none-             character  
## problemType         1 -none-             character  
## tuneValue           4 data.frame         list       
## obsLevels           1 -none-             logical    
## param               1 -none-             list
#alternatively, you can 'narrow in' on the best parameters: repeat the above with a range of options around
#the best values found and see whether higher-resolution tweaks provide further improvements.
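# hypothetical refinement (sketch, not run here): a narrower grid centered on
# the bestTune values found above (nrounds = 1000, eta = 1e-04)
xgb_grid_2 = expand.grid(nrounds = c(500, 750, 1000, 1500),
                         eta = c(5e-05, 1e-04, 5e-04),
                         lambda = 1,
                         alpha = 0)
# xgb_train_2 = train(x = as.matrix(train_x), y = train_y,
#                     trControl = xgb_trcontrol_1, tuneGrid = xgb_grid_2,
#                     method = "xgbLinear", max.depth = 5)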

xgb_cv_yhat = predict(xgb_train_1 , as.matrix(test_x))


test_mse = mean(((xgb_cv_yhat - test_y)^2))
test_rmse = sqrt(test_mse)
test_rmse # ~46670... pretty close to the 'by hand' grid search!
## [1] 46672.62

Ensembling all the models

We can ensemble the predictions at the end when prediction accuracy matters more than model interpretability.

#y_pred_rf #random forest
#y_hat_valid #xgBoost with validation
#y_hat_xgb_grid #xgBoost grid search
#xgb_cv_yhat #xgBoost caret cross validation

length(y_hat_xgb_grid)
## [1] 4128
blend_pred = (y_hat_valid * .25) + (y_pred_rf * .25) + (xgb_cv_yhat * .25) + (y_hat_xgb_grid * .25)
length(blend_pred)
## [1] 4128
length(blend_pred) == length(y_hat_xgb_grid)
## [1] TRUE
blend_test_mse = mean(((blend_pred - test_y)^2))
blend_test_rmse = sqrt(blend_test_mse)
blend_test_rmse
## [1] 44684.51
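For context, each model's individual test RMSE next to the blend (a quick sketch using the prediction vectors already in memory):

# helper: test RMSE for a vector of predictions
rmse = function(pred) sqrt(mean((pred - test_y)^2))
sapply(list(rf = y_pred_rf,
            xgb_valid = y_hat_valid,
            xgb_grid = y_hat_xgb_grid,
            xgb_caret = xgb_cv_yhat,
            blend = blend_pred), rmse)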