Code chunks that take a long time to run and are not needed for the final model are set to `eval = FALSE` (rather than being commented out, for better legibility).
There are two sets of data: (i) training data that has the actual prices, and (ii) out-of-sample data that has the asking prices. Load both data sets.
Make sure you understand what information each column contains. Note that not all information provided might be useful in predicting house prices, but do not make any assumptions before you decide what information you use in your prediction algorithms.
# Read in the two data sets: the training data (with actual prices) and the
# out-of-sample data (with asking prices).
london_house_prices_2019_training <-
  read.csv("training_data_assignment_with_prices.csv")
london_house_prices_2019_out_of_sample <-
  read.csv("test_data_assignment.csv")

# Fix data types in both data sets: parse `date` as a Date first, then turn
# every remaining character column into a factor.
london_house_prices_2019_training <-
  london_house_prices_2019_training %>%
  mutate(date = as.Date(date)) %>%
  mutate(across(where(is.character), as.factor))
london_house_prices_2019_out_of_sample <-
  london_house_prices_2019_out_of_sample %>%
  mutate(date = as.Date(date)) %>%
  mutate(across(where(is.character), as.factor))
#take a quick look at what's in the data
str(london_house_prices_2019_training)## 'data.frame': 13998 obs. of 37 variables:
## $ ID : int 2 3 4 5 7 8 9 10 11 12 ...
## $ date : Date, format: "2019-11-01" "2019-08-08" ...
## $ postcode : Factor w/ 12635 levels "BR1 1AB","BR1 1LR",..: 10897 11027 11264 2031 11241 11066 421 9594 9444 873 ...
## $ property_type : Factor w/ 4 levels "D","F","S","T": 2 2 3 2 3 2 1 4 4 2 ...
## $ whether_old_or_new : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ freehold_or_leasehold : Factor w/ 2 levels "F","L": 2 2 1 2 1 2 1 1 1 2 ...
## $ address1 : Factor w/ 2825 levels "1","1 - 2","1 - 3",..: 2503 792 253 789 569 234 264 418 5 274 ...
## $ address2 : Factor w/ 434 levels "1","10","101",..: 372 NA NA NA NA NA NA NA NA NA ...
## $ address3 : Factor w/ 8543 levels "ABBERTON WALK",..: 6990 6821 3715 2492 4168 2879 3620 5251 6045 6892 ...
## $ town : Factor w/ 133 levels "ABBEY WOOD","ACTON",..: NA NA NA 78 NA NA NA NA NA NA ...
## $ local_aut : Factor w/ 69 levels "ASHFORD","BARKING",..: 36 46 24 36 24 46 65 36 36 17 ...
## $ county : Factor w/ 33 levels "BARKING AND DAGENHAM",..: 22 27 18 25 18 27 5 27 32 8 ...
## $ postcode_short : Factor w/ 247 levels "BR1","BR2","BR3",..: 190 194 198 28 198 194 4 169 167 8 ...
## $ current_energy_rating : Factor w/ 6 levels "B","C","D","E",..: 4 3 3 4 3 2 4 3 4 2 ...
## $ total_floor_area : num 30 50 100 39 88 101 136 148 186 65 ...
## $ number_habitable_rooms : int 2 2 5 2 4 4 6 6 6 3 ...
## $ co2_emissions_current : num 2.3 3 3.7 2.8 3.9 3.1 8.1 5.6 10 1.5 ...
## $ co2_emissions_potential : num 1.7 1.7 1.5 1.1 1.4 1.4 4.1 2 6.1 1.5 ...
## $ energy_consumption_current : int 463 313 212 374 251 175 339 216 308 128 ...
## $ energy_consumption_potential: int 344 175 82 144 90 77 168 75 186 128 ...
## $ windows_energy_eff : Factor w/ 5 levels "Average","Good",..: 1 1 1 5 1 1 1 1 5 1 ...
## $ tenure : Factor w/ 3 levels "owner-occupied",..: 1 2 1 2 1 1 1 2 1 1 ...
## $ latitude : num 51.5 51.5 51.5 51.6 51.5 ...
## $ longitude : num -0.1229 -0.2828 -0.4315 0.0423 -0.4293 ...
## $ population : int 34 75 83 211 73 51 25 91 60 97 ...
## $ altitude : int 8 9 25 11 21 11 95 7 7 106 ...
## $ london_zone : int 1 3 5 3 6 6 3 2 2 3 ...
## $ nearest_station : Factor w/ 592 levels "abbey road","abbey wood",..: 478 358 235 319 180 502 566 30 32 566 ...
## $ water_company : Factor w/ 5 levels "Affinity Water",..: 5 5 1 5 1 5 5 5 5 5 ...
## $ average_income : int 57200 61900 50600 45400 49000 56200 57200 65600 50400 52300 ...
## $ district : Factor w/ 33 levels "Barking and Dagenham",..: 22 27 18 26 18 27 5 27 32 8 ...
## $ price : num 360000 408500 499950 259999 395000 ...
## $ type_of_closest_station : Factor w/ 3 levels "light_rail","rail",..: 3 2 3 1 3 2 1 3 1 1 ...
## $ num_tube_lines : int 1 0 1 0 1 0 0 2 0 0 ...
## $ num_rail_lines : int 0 1 1 0 1 1 0 0 1 0 ...
## $ num_light_rail_lines : int 0 0 0 1 0 0 1 0 1 1 ...
## $ distance_to_station : num 0.528 0.77 0.853 0.29 1.073 ...
str(london_house_prices_2019_out_of_sample)## 'data.frame': 1999 obs. of 37 variables:
## $ ID : int 14434 12562 8866 10721 1057 1527 13961 12108 9363 1155 ...
## $ date : Date, format: NA NA ...
## $ postcode : logi NA NA NA NA NA NA ...
## $ property_type : Factor w/ 4 levels "D","F","S","T": 1 2 2 3 4 3 2 3 2 4 ...
## $ whether_old_or_new : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ freehold_or_leasehold : Factor w/ 2 levels "F","L": 1 2 2 1 1 1 2 1 2 1 ...
## $ address1 : logi NA NA NA NA NA NA ...
## $ address2 : logi NA NA NA NA NA NA ...
## $ address3 : logi NA NA NA NA NA NA ...
## $ town : Factor w/ 54 levels "ACTON","ADDISCOMBE",..: NA NA NA NA NA NA NA NA NA NA ...
## $ local_aut : logi NA NA NA NA NA NA ...
## $ county : logi NA NA NA NA NA NA ...
## $ postcode_short : Factor w/ 221 levels "BR1","BR2","BR3",..: 82 50 37 52 214 150 159 115 175 126 ...
## $ current_energy_rating : Factor w/ 6 levels "B","C","D","E",..: 3 2 3 3 4 4 4 3 4 3 ...
## $ total_floor_area : num 150 59 58 74 97.3 ...
## $ number_habitable_rooms : int 6 2 2 5 5 5 5 4 2 5 ...
## $ co2_emissions_current : num 7.3 1.5 2.8 3.5 6.5 4.9 5.1 2.9 4.2 4.3 ...
## $ co2_emissions_potential : num 2.4 1.4 1.2 1.2 5.7 1.6 3 0.8 3.2 2.5 ...
## $ energy_consumption_current : int 274 142 253 256 303 309 240 224 458 253 ...
## $ energy_consumption_potential: int 89 136 110 80 266 101 140 58 357 143 ...
## $ windows_energy_eff : Factor w/ 5 levels "Average","Good",..: 1 1 1 1 1 1 3 1 3 1 ...
## $ tenure : Factor w/ 3 levels "owner-occupied",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ latitude : num 51.6 51.6 51.5 51.6 51.5 ...
## $ longitude : num -0.129 -0.2966 -0.0328 -0.3744 -0.2576 ...
## $ population : int 87 79 23 73 100 24 22 49 65 98 ...
## $ altitude : int 63 38 17 39 8 46 26 16 14 18 ...
## $ london_zone : int 4 4 2 5 2 4 3 6 1 3 ...
## $ nearest_station : Factor w/ 494 levels "abbey wood","acton central",..: 16 454 181 302 431 142 20 434 122 212 ...
## $ water_company : Factor w/ 4 levels "Affinity Water",..: 4 1 4 1 4 4 4 2 4 4 ...
## $ average_income : int 61300 48900 46200 52200 60700 59600 64000 48100 56600 53500 ...
## $ district : Factor w/ 32 levels "Barking and Dagenham",..: 9 4 29 14 17 10 31 15 19 22 ...
## $ type_of_closest_station : Factor w/ 3 levels "light_rail","rail",..: 3 3 1 2 3 2 3 3 3 2 ...
## $ num_tube_lines : int 1 2 0 0 2 0 1 1 2 0 ...
## $ num_rail_lines : int 0 1 0 1 0 1 1 0 0 1 ...
## $ num_light_rail_lines : int 0 1 1 0 0 0 0 1 0 0 ...
## $ distance_to_station : num 0.839 0.104 0.914 0.766 0.449 ...
## $ asking_price : num 750000 229000 152000 379000 930000 350000 688000 386000 534000 459000 ...
# Initial split: keep 75% of the priced data for training and hold out the
# remaining 25% for testing.
library(rsample)
set.seed(123) # fixed seed so the split is reproducible
train_test_split <- initial_split(
  london_house_prices_2019_training,
  prop = 0.75 # training set contains 75% of the data
)
# Create the training and testing data sets
train_data <- training(train_test_split)
test_data <- testing(train_test_split)

# Empty results table that m_comp_table() fills in, one row per model.
# tibble() replaces data_frame(), which is deprecated since tibble 1.1.0 and
# raised a lifecycle warning here.
lr_results <- tibble(
  model_name = character(),
  RMSE = double(),
  Rsquare = double()
)
# Evaluate `model` on the held-out `test_data` and record its test-set RMSE
# and R-squared in the global `lr_results` table.
#
# model:  a fitted model accepted by predict() (the callers in this file pass
#         caret train objects).
# m_name: label identifying the model in `lr_results`. If a row with this
#         label already exists its metrics are overwritten, otherwise a new
#         row is appended.
#
# Returns the updated `lr_results`. The global table is replaced with `<<-`
# on purpose, so that successive calls accumulate one comparison table; it is
# written exactly once, after all metrics have been computed.
m_comp_table <- function(model, m_name) {
  prediction <- predict(model, test_data)
  # predict() can return fewer values than there are rows (e.g. if rows with
  # missing predictors are dropped); fail clearly instead of scoring
  # misaligned vectors.
  if (length(prediction) != nrow(test_data)) {
    stop(
      "predict() returned ", length(prediction), " values for ",
      nrow(test_data), " rows of test_data",
      call. = FALSE
    )
  }

  # Compute each metric once and reuse it in whichever branch applies.
  rmse <- RMSE(prediction, test_data$price)
  rsq <- R2(prediction, test_data$price)

  results <- lr_results
  existing <- results$model_name == m_name
  if (any(existing)) {
    results$RMSE[existing] <- rmse
    results$Rsquare[existing] <- rsq
  } else {
    results <- add_row(
      results,
      model_name = m_name,
      RMSE = rmse,
      Rsquare = rsq
    )
  }

  lr_results <<- results
  results
}

# Plot the scaled variable importance of `model`, using `m_name` as the plot
# title.
var_importance <- function(model, m_name) {
  importance <- varImp(model, scale = TRUE)
  plot(importance, main = m_name)
}
Visualize and examine the data. What plots could be useful here? What do you learn from these visualizations?
library(ggplot2)
london_house_prices_2019_training %>%
ggplot(aes(x = price,
y = total_floor_area))+
geom_point(aes(color = number_habitable_rooms))+
scale_x_log10()+
geom_smooth(method = "lm")+
labs(title = "Price and Total Floor Area are strongly Correlated",
y = "Total Floor Area",
x = "price",
color= "number of habitable rooms")+
theme_minimal()+
scale_color_binned(type = "viridis")## `geom_smooth()` using formula 'y ~ x'
london_house_prices_2019_training %>%
ggplot(aes(x = price)) +
geom_histogram()+
labs(title = "Prices are Right Skewed")+
theme_minimal()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
london_house_prices_2019_training %>%
ggplot(aes(x = total_floor_area))+
geom_histogram()+
labs(title = "Distribution of Apartment Sizes")+
theme_minimal()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
london_house_prices_2019_training %>%
ggplot(aes(x = number_habitable_rooms))+
geom_histogram()+
labs(title = "Distribution of Number of Rooms")+
theme_minimal()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# there is a weak negative correlation between price and distance to station
london_house_prices_2019_training %>%
ggplot(aes(x = distance_to_station, y = price))+
geom_point()+
geom_smooth(method = "lm")+
labs(title = "Price and Distance to Station correlation",
x = "distance to station",
y = "price")+
theme_minimal()## `geom_smooth()` using formula 'y ~ x'
london_house_prices_2019_training %>%
ggplot(aes(x = (total_floor_area)))+
geom_histogram()+
labs(title = "Total Floor Area Distribution")+
theme_minimal()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Estimate a correlation table between prices and other continuous variables. What do you glean from the correlation table?
# produce a correlation table using GGally::ggcorr()
# this takes a while to plot
library(tidyverse)
library("GGally")
london_house_prices_2019_training %>%
select(-ID) %>% #keep Y variable last
ggcorr(method = c("pairwise", "pearson"), layout.exp = 2,label_round=2, label = TRUE,label_size = 2,hjust = 1,nbreaks = 5,size = 2,angle = -20)To help you get started I build a linear regression model below. I chose a subset of the features with no particular goal. You can (and should) add more variables and/or choose variable selection methods if you want.
set.seed(123)
#Define control variables
control <- trainControl (
method="cv",
number=5,
verboseIter=TRUE) #by setting this to true the model will report its progress after each estimation
#we are going to train the model and report the results using k-fold cross validation
model1_lm<-train(
price ~ distance_to_station +water_company+property_type+whether_old_or_new+freehold_or_leasehold+latitude+ longitude,
train_data,
method = "lm",
trControl = control
)## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
summary(model1_lm)##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -841408 -197212 -68223 67842 9993869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -47499527 3776562 -12.577 < 2e-16 ***
## distance_to_station -150809 12588 -11.981 < 2e-16 ***
## `water_companyEssex & Suffolk Water` 407666 29859 13.653 < 2e-16 ***
## `water_companyLeep Utilities` 507926 277996 1.827 0.0677 .
## `water_companySES Water` 279019 30330 9.199 < 2e-16 ***
## `water_companyThames Water` 503766 20154 24.995 < 2e-16 ***
## property_typeF -489809 41899 -11.690 < 2e-16 ***
## property_typeS -346982 22026 -15.753 < 2e-16 ***
## property_typeT -388838 21134 -18.398 < 2e-16 ***
## whether_old_or_newY 22178 181661 0.122 0.9028
## freehold_or_leaseholdL -170308 36726 -4.637 3.57e-06 ***
## latitude 934540 73207 12.766 < 2e-16 ***
## longitude -1036285 41392 -25.036 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 480100 on 10485 degrees of freedom
## Multiple R-squared: 0.1802, Adjusted R-squared: 0.1792
## F-statistic: 192 on 12 and 10485 DF, p-value: < 2.2e-16
model2_lm<-train(
price ~ average_income:london_zone+
district +
water_company+
property_type+
freehold_or_leasehold+
latitude*longitude*total_floor_area*altitude+
average_income:number_habitable_rooms:total_floor_area+
I(average_income^3)+
energy_consumption_current+
energy_consumption_potential+
windows_energy_eff+
co2_emissions_current:co2_emissions_potential+
current_energy_rating+
distance_to_station+
num_tube_lines+
type_of_closest_station+
num_rail_lines+
num_light_rail_lines
,
metric = "RMSE",
na.action = na.omit,
train_data,
method = "lm",
trControl = control
)## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# summary of the results
summary(model2_lm)##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2135160 -81814 4761 84749 6234747
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -7.029e+07 1.589e+07
## districtBarnet 2.100e+05 4.892e+04
## districtBexley 4.394e+04 4.500e+04
## districtBrent 3.038e+05 5.329e+04
## districtBromley 7.033e+04 4.877e+04
## districtCamden 4.307e+05 4.779e+04
## `districtCity of London` 4.089e+05 1.314e+05
## districtCroydon 2.912e+04 4.917e+04
## districtEaling 2.770e+05 5.576e+04
## districtEnfield 1.556e+05 4.443e+04
## districtGreenwich 4.447e+04 4.308e+04
## districtHackney 2.477e+05 4.351e+04
## `districtHammersmith and Fulham` 4.370e+05 5.297e+04
## districtHaringey 2.248e+05 4.602e+04
## districtHarrow 3.289e+05 6.143e+04
## districtHavering 3.826e+04 2.734e+04
## districtHillingdon 3.977e+05 6.339e+04
## districtHounslow 2.777e+05 5.668e+04
## districtIslington 3.344e+05 4.614e+04
## `districtKensington and Chelsea` 1.122e+06 5.132e+04
## `districtKingston upon Thames` 1.677e+05 5.401e+04
## districtLambeth 1.611e+05 4.745e+04
## districtLewisham 4.921e+04 4.532e+04
## districtMerton 1.055e+05 5.131e+04
## districtNewham 7.886e+02 4.133e+04
## districtRedbridge -2.962e+03 3.117e+04
## `districtRichmond upon Thames` 3.132e+05 5.557e+04
## districtSouthwark 1.872e+05 4.498e+04
## districtSutton 3.721e+04 5.498e+04
## `districtTower Hamlets` 1.046e+05 4.412e+04
## `districtWaltham Forest` 1.312e+05 4.056e+04
## districtWandsworth 1.786e+05 4.879e+04
## districtWestminster 7.502e+05 5.018e+04
## `water_companyEssex & Suffolk Water` -7.984e+04 3.681e+04
## `water_companyLeep Utilities` 1.537e+04 1.605e+05
## `water_companySES Water` 3.120e+04 3.099e+04
## `water_companyThames Water` 5.440e+03 1.940e+04
## property_typeF -4.347e+04 2.531e+04
## property_typeS -3.941e+04 1.365e+04
## property_typeT -1.383e+04 1.392e+04
## freehold_or_leaseholdL -5.599e+03 2.137e+04
## latitude 1.365e+06 3.083e+05
## longitude -1.185e+09 8.369e+07
## total_floor_area 5.644e+05 1.111e+05
## altitude 2.115e+06 2.233e+05
## `I(average_income^3)` 9.267e-10 4.653e-11
## energy_consumption_current 2.324e+02 6.404e+01
## energy_consumption_potential -2.097e+02 5.471e+01
## windows_energy_effGood 2.062e+04 6.882e+03
## windows_energy_effPoor 4.399e+04 1.017e+04
## `windows_energy_effVery Good` 5.112e+04 1.381e+05
## `windows_energy_effVery Poor` 2.203e+04 9.368e+03
## current_energy_ratingC -2.924e+04 1.823e+04
## current_energy_ratingD -3.733e+04 1.968e+04
## current_energy_ratingE -7.635e+04 2.330e+04
## current_energy_ratingF -1.673e+05 3.254e+04
## current_energy_ratingG -2.292e+05 5.079e+04
## distance_to_station -1.847e+03 7.995e+03
## num_tube_lines 8.484e+03 7.898e+03
## type_of_closest_stationrail -6.076e+04 2.001e+04
## type_of_closest_stationtube -2.870e+04 1.911e+04
## num_rail_lines 1.590e+04 1.039e+04
## num_light_rail_lines -4.883e+04 1.601e+04
## `average_income:london_zone` -1.025e+00 7.546e-02
## `latitude:longitude` 2.305e+07 1.626e+06
## `latitude:total_floor_area` -1.086e+04 2.157e+03
## `longitude:total_floor_area` 9.353e+06 6.487e+05
## `latitude:altitude` -4.105e+04 4.340e+03
## `longitude:altitude` 1.706e+07 1.352e+06
## `total_floor_area:altitude` -1.991e+04 1.848e+03
## `co2_emissions_current:co2_emissions_potential` 3.633e+03 2.638e+02
## `latitude:longitude:total_floor_area` -1.820e+05 1.260e+04
## `latitude:longitude:altitude` -3.318e+05 2.625e+04
## `latitude:total_floor_area:altitude` 3.866e+02 3.592e+01
## `longitude:total_floor_area:altitude` -1.483e+05 1.091e+04
## `average_income:total_floor_area:number_habitable_rooms` 1.797e-03 3.377e-04
## `latitude:longitude:total_floor_area:altitude` 2.887e+03 2.120e+02
## t value Pr(>|t|)
## (Intercept) -4.423 9.82e-06 ***
## districtBarnet 4.292 1.79e-05 ***
## districtBexley 0.977 0.328801
## districtBrent 5.701 1.22e-08 ***
## districtBromley 1.442 0.149304
## districtCamden 9.012 < 2e-16 ***
## `districtCity of London` 3.112 0.001861 **
## districtCroydon 0.592 0.553696
## districtEaling 4.967 6.90e-07 ***
## districtEnfield 3.502 0.000463 ***
## districtGreenwich 1.032 0.301986
## districtHackney 5.693 1.28e-08 ***
## `districtHammersmith and Fulham` 8.250 < 2e-16 ***
## districtHaringey 4.885 1.05e-06 ***
## districtHarrow 5.355 8.75e-08 ***
## districtHavering 1.400 0.161673
## districtHillingdon 6.274 3.65e-10 ***
## districtHounslow 4.900 9.75e-07 ***
## districtIslington 7.249 4.51e-13 ***
## `districtKensington and Chelsea` 21.857 < 2e-16 ***
## `districtKingston upon Thames` 3.104 0.001912 **
## districtLambeth 3.396 0.000686 ***
## districtLewisham 1.086 0.277510
## districtMerton 2.057 0.039732 *
## districtNewham 0.019 0.984778
## districtRedbridge -0.095 0.924291
## `districtRichmond upon Thames` 5.637 1.77e-08 ***
## districtSouthwark 4.162 3.18e-05 ***
## districtSutton 0.677 0.498613
## `districtTower Hamlets` 2.371 0.017762 *
## `districtWaltham Forest` 3.234 0.001226 **
## districtWandsworth 3.661 0.000252 ***
## districtWestminster 14.950 < 2e-16 ***
## `water_companyEssex & Suffolk Water` -2.169 0.030110 *
## `water_companyLeep Utilities` 0.096 0.923743
## `water_companySES Water` 1.007 0.314115
## `water_companyThames Water` 0.280 0.779211
## property_typeF -1.718 0.085893 .
## property_typeS -2.888 0.003886 **
## property_typeT -0.994 0.320370
## freehold_or_leaseholdL -0.262 0.793300
## latitude 4.426 9.70e-06 ***
## longitude -14.158 < 2e-16 ***
## total_floor_area 5.080 3.83e-07 ***
## altitude 9.467 < 2e-16 ***
## `I(average_income^3)` 19.916 < 2e-16 ***
## energy_consumption_current 3.630 0.000285 ***
## energy_consumption_potential -3.834 0.000127 ***
## windows_energy_effGood 2.996 0.002742 **
## windows_energy_effPoor 4.324 1.55e-05 ***
## `windows_energy_effVery Good` 0.370 0.711248
## `windows_energy_effVery Poor` 2.352 0.018692 *
## current_energy_ratingC -1.604 0.108747
## current_energy_ratingD -1.897 0.057888 .
## current_energy_ratingE -3.277 0.001052 **
## current_energy_ratingF -5.142 2.77e-07 ***
## current_energy_ratingG -4.512 6.49e-06 ***
## distance_to_station -0.231 0.817296
## num_tube_lines 1.074 0.282729
## type_of_closest_stationrail -3.036 0.002400 **
## type_of_closest_stationtube -1.502 0.133090
## num_rail_lines 1.531 0.125918
## num_light_rail_lines -3.049 0.002301 **
## `average_income:london_zone` -13.586 < 2e-16 ***
## `latitude:longitude` 14.180 < 2e-16 ***
## `latitude:total_floor_area` -5.036 4.83e-07 ***
## `longitude:total_floor_area` 14.419 < 2e-16 ***
## `latitude:altitude` -9.459 < 2e-16 ***
## `longitude:altitude` 12.621 < 2e-16 ***
## `total_floor_area:altitude` -10.773 < 2e-16 ***
## `co2_emissions_current:co2_emissions_potential` 13.771 < 2e-16 ***
## `latitude:longitude:total_floor_area` -14.444 < 2e-16 ***
## `latitude:longitude:altitude` -12.640 < 2e-16 ***
## `latitude:total_floor_area:altitude` 10.763 < 2e-16 ***
## `longitude:total_floor_area:altitude` -13.597 < 2e-16 ***
## `average_income:total_floor_area:number_habitable_rooms` 5.320 1.06e-07 ***
## `latitude:longitude:total_floor_area:altitude` 13.619 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275100 on 10421 degrees of freedom
## Multiple R-squared: 0.7325, Adjusted R-squared: 0.7306
## F-statistic: 375.5 on 76 and 10421 DF, p-value: < 2.2e-16
colnames(train_data)## [1] "ID" "date"
## [3] "postcode" "property_type"
## [5] "whether_old_or_new" "freehold_or_leasehold"
## [7] "address1" "address2"
## [9] "address3" "town"
## [11] "local_aut" "county"
## [13] "postcode_short" "current_energy_rating"
## [15] "total_floor_area" "number_habitable_rooms"
## [17] "co2_emissions_current" "co2_emissions_potential"
## [19] "energy_consumption_current" "energy_consumption_potential"
## [21] "windows_energy_eff" "tenure"
## [23] "latitude" "longitude"
## [25] "population" "altitude"
## [27] "london_zone" "nearest_station"
## [29] "water_company" "average_income"
## [31] "district" "price"
## [33] "type_of_closest_station" "num_tube_lines"
## [35] "num_rail_lines" "num_light_rail_lines"
## [37] "distance_to_station"
glimpse(train_data)## Rows: 10,498
## Columns: 37
## $ ID <int> 2811, 2869, 11930, 9984, 14268, 3421, 212…
## $ date <date> 2019-08-23, 2019-01-04, 2019-11-08, 2019…
## $ postcode <fct> SW8 3PE, SE10 8UJ, SW20 9AJ, RM12 5LU, SE…
## $ property_type <fct> F, F, S, T, S, F, F, S, T, T, F, T, T, F,…
## $ whether_old_or_new <fct> N, N, N, N, N, N, N, N, N, N, N, N, N, N,…
## $ freehold_or_leasehold <fct> L, L, F, F, F, L, L, F, F, F, L, F, F, L,…
## $ address1 <fct> "89", "69A", "73", "24", "2", "CAMELLIA H…
## $ address2 <fct> NA, NA, NA, NA, NA, FLAT 56, FLAT 1, NA, …
## $ address3 <fct> INGELOW ROAD, ASHBURNHAM GROVE, AYLWARD R…
## $ town <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ local_aut <fct> LONDON, LONDON, LONDON, HORNCHURCH, LONDO…
## $ county <fct> WANDSWORTH, GREENWICH, MERTON, HAVERING, …
## $ postcode_short <fct> SW8, SE10, SW20, RM12, SE25, TW13, SW11, …
## $ current_energy_rating <fct> C, D, E, D, F, B, D, C, D, E, C, D, D, C,…
## $ total_floor_area <dbl> 52.00, 42.00, 167.00, 86.00, 98.00, 60.00…
## $ number_habitable_rooms <int> 3, 2, 7, 4, 5, 3, 4, 8, 3, 6, 2, 7, 4, 4,…
## $ co2_emissions_current <dbl> 3.3, 2.0, 8.5, 5.5, 8.1, 2.1, 2.6, 5.2, 3…
## $ co2_emissions_potential <dbl> 2.6, 1.1, 3.8, 1.0, 2.5, 1.1, 2.1, 3.0, 0…
## $ energy_consumption_current <int> 243, 269, 289, 364, 471, 207, 221, 163, 2…
## $ energy_consumption_potential <int> 195, 150, 127, 66, 144, 113, 180, 95, 51,…
## $ windows_energy_eff <fct> Average, Average, Good, Average, Average,…
## $ tenure <fct> owner-occupied, owner-occupied, owner-occ…
## $ latitude <dbl> 51.46932, 51.47542, 51.40642, 51.54360, 5…
## $ longitude <dbl> -0.150607, -0.015359, -0.209380, 0.204097…
## $ population <int> 147, 109, 147, 76, 113, 156, 132, 80, 143…
## $ altitude <int> 3, 9, 16, 12, 68, 22, 23, 16, 16, 55, 6, …
## $ london_zone <int> 2, 2, 4, 6, 4, 6, 2, 4, 4, 5, 4, 3, 2, 3,…
## $ nearest_station <fct> queenstown road, greenwich, south merton,…
## $ water_company <fct> Thames Water, Thames Water, Thames Water,…
## $ average_income <int> 58900, 61400, 72900, 48200, 51400, 53700,…
## $ district <fct> Lambeth, Greenwich, Merton, Havering, Cro…
## $ price <dbl> 748500, 422000, 925000, 375000, 567000, 2…
## $ type_of_closest_station <fct> rail, light_rail, rail, tube, light_rail,…
## $ num_tube_lines <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1,…
## $ num_rail_lines <int> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,…
## $ num_light_rail_lines <int> 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,…
## $ distance_to_station <dbl> 0.5381324, 0.2299180, 0.4387084, 0.610214…
# anova(model1_lm$finalModel, model2_lm$finalModel)
set.seed(123)
# load the library
library(Boruta)

# Columns excluded from the Boruta screen (identifiers, address fields and
# other columns the analysis does not want to use as predictors). "price" is
# listed as well, but it is the response and therefore has to stay in the data.
features_remove <- c(
  "price", "address1", "address2", "address3", "population", "ID", "town",
  "nearest_station", "date", "postcode", "postcode_short", "local_aut"
)

# Drop the excluded columns *before* na.omit(). Previously `features_remove`
# was never used, so na.omit() discarded every row with a missing `address2`
# or `town`, and Boruta also screened the identifier columns.
boruta_data <- train_data %>%
  select(-all_of(setdiff(features_remove, "price"))) %>%
  na.omit()
result <- Boruta(price ~ ., boruta_data)
imp <- as.data.frame(attStats(result))

# Importance history (one column per attribute plus the shadow summaries) and
# the final decision per attribute.
b_out <- as.data.frame(result$ImpHistory)
decision <- as.data.frame(result$finalDecision)
decision$feature <- row.names(decision)
colnames(decision) <- c("decision", "feature")
# everything() instead of a hard-coded 1:39, so the reshape keeps working
# when the number of screened attributes changes.
b_out <- pivot_longer(b_out, cols = everything(), names_to = "features")
b_out <- left_join(b_out, decision, by = c("features" = "feature"))

# plot(result, cex.axis=.7, las=2, xlab="", main="Variable Importance")
b_out %>%
  filter(
    !features %in% c("shadowMean", "shadowMin", "shadowMax"),
    # non-finite importances (Boruta records -Inf once an attribute is
    # rejected) cannot be drawn and only trigger a ggplot warning
    is.finite(value)
  ) %>%
  ggplot(aes(
    x = value,
    y = reorder(features, value),
    fill = decision
  )) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    y = "",
    x = "importance"
  ) +
  # Named values keep each colour tied to its decision even when one of the
  # three decision levels does not occur in the result.
  scale_fill_manual(
    values = c(Tentative = "yellow", Confirmed = "green", Rejected = "red")
  )
# we can check variable importance as well
var_importance(model = model2_lm, "Linear Regression")Below I use the predict function to test the performance of the model in testing data and summarize the performance of the linear regression model. How can you measure the quality of your predictions?
# We can predict the testing values
m_comp_table(model1_lm, "model1_lm")## # A tibble: 1 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
m_comp_table(model2_lm, "model2_lm")## # A tibble: 2 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
# predictions <- predict(model1_lm,test_data)
#
# lr_results<-data.frame(
# model_name = "model1_lm",
# RMSE = RMSE(predictions, test_data$price),
# Rsquare = R2(predictions, test_data$price)
# )
#
#
# lr_results
#
# predictions <- predict(model2_lm,test_data)
#
# lr_results<-lr_results %>%
# add_row(model_name = "model2_lm",
# RMSE = RMSE(predictions, test_data$price),
# Rsquare = R2(predictions, test_data$price))
#
#
# lr_results
#We can predict prices for out of sample data the same way
predictions_oos <- predict(model1_lm,london_house_prices_2019_out_of_sample)Next I fit a tree model using the same subset of features. Again you can (and should) add more variables and tune the parameter of your tree to find a better fit.
Compare the performance of the linear regression model with the tree model; which one performs better? Why do you think that is the case?
# Baseline regression tree (rpart) tuned with 5-fold cross-validation.
set.seed(123)
# Same settings as the `control` object built for the linear models
# (5-fold CV, progress reported after each fit).
control <- trainControl (
method="cv",
number=5,
verboseIter=TRUE)
model1_tree <- train(
# NOTE(review): unlike model1_lm, this formula leaves out
# freehold_or_leasehold, so the two baseline models do not use exactly the
# same predictors.
price ~ distance_to_station +water_company+property_type+whether_old_or_new+latitude+ longitude,
train_data,
method = "rpart",
trControl = control,
# ask caret to try 10 candidate values of the tuning parameter (cp)
tuneLength=10
)## + Fold1: cp=0.01036
## - Fold1: cp=0.01036
## + Fold2: cp=0.01036
## - Fold2: cp=0.01036
## + Fold3: cp=0.01036
## - Fold3: cp=0.01036
## + Fold4: cp=0.01036
## - Fold4: cp=0.01036
## + Fold5: cp=0.01036
## - Fold5: cp=0.01036
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0104 on full training set
#You can view how the tree performs
# NOTE(review): slice_max() keeps the rows with the largest cross-validated
# RMSE, so this lists the worst-performing cp values (ties can return more
# than n rows); slice_min(order_by = RMSE, n = 5) would list the best ones.
model1_tree$results %>%
slice_max(order_by = RMSE, n=5)## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.05317292 495882.0 0.1504787 262408.2 47627.21 0.04678605 17660.887
## 2 0.05307855 487393.1 0.1522130 255986.3 47543.25 0.04070308 11617.939
## 3 0.04525601 472418.1 0.2088170 249662.1 38908.44 0.07165916 7165.348
## 4 0.03643516 458310.3 0.2611621 241849.6 44879.89 0.09844137 8490.009
## 5 0.02857529 453340.1 0.2783804 240079.2 40780.95 0.08953237 6978.154
## 6 0.03029526 453340.1 0.2783804 240079.2 40780.95 0.08953237 6978.154
#You can view the final tree
rpart.plot(model1_tree$finalModel)#you can also visualize the variable importance
importance <- varImp(model1_tree, scale=TRUE)
plot(importance)

# Regression tree on the extended feature set, tuned over the complexity
# parameter cp with the cross-validation settings stored in `control`.
model2_tree <- train(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    num_tube_lines +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  train_data,
  method = "rpart",
  trControl = control,
  metric = "RMSE",
  # Only tuneGrid is passed: caret uses tuneLength solely to build a default
  # grid when no tuneGrid is supplied, so the former tuneLength = 15 had no
  # effect next to this explicit grid of 20 cp values.
  tuneGrid = expand.grid(cp = seq(0.00001, 0.0002, 0.00001))
)
## + Fold1: cp=1e-05
## - Fold1: cp=1e-05
## + Fold2: cp=1e-05
## - Fold2: cp=1e-05
## + Fold3: cp=1e-05
## - Fold3: cp=1e-05
## + Fold4: cp=1e-05
## - Fold4: cp=1e-05
## + Fold5: cp=1e-05
## - Fold5: cp=1e-05
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 3e-05 on full training set
colnames(train_data)## [1] "ID" "date"
## [3] "postcode" "property_type"
## [5] "whether_old_or_new" "freehold_or_leasehold"
## [7] "address1" "address2"
## [9] "address3" "town"
## [11] "local_aut" "county"
## [13] "postcode_short" "current_energy_rating"
## [15] "total_floor_area" "number_habitable_rooms"
## [17] "co2_emissions_current" "co2_emissions_potential"
## [19] "energy_consumption_current" "energy_consumption_potential"
## [21] "windows_energy_eff" "tenure"
## [23] "latitude" "longitude"
## [25] "population" "altitude"
## [27] "london_zone" "nearest_station"
## [29] "water_company" "average_income"
## [31] "district" "price"
## [33] "type_of_closest_station" "num_tube_lines"
## [35] "num_rail_lines" "num_light_rail_lines"
## [37] "distance_to_station"
#You can view how the tree performs
# NOTE(review): slice_max() on RMSE keeps the five rows with the HIGHEST
# cross-validated error, i.e. the worst cp values in the grid (see the
# descending RMSE in the output below). slice_min() would list the best ones.
model2_tree$results%>%
slice_max(order_by = RMSE, n=5)## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.00020 256063.9 0.7689396 128793.1 21786.96 0.01163043 4620.483
## 2 0.00014 256031.8 0.7693208 127616.1 21062.35 0.01129435 4046.326
## 3 0.00017 255997.9 0.7691948 128318.0 21647.15 0.01162158 4607.306
## 4 0.00012 255957.8 0.7695558 126858.9 20934.25 0.01110654 3550.643
## 5 0.00016 255913.2 0.7694385 128015.3 21553.29 0.01109444 4463.192
# Report model2_tree in the model comparison table (RMSE and Rsquare per
# model). m_comp_table() is defined elsewhere in this file; presumably it
# appends one row per call -- TODO confirm against its definition.
m_comp_table(model2_tree, "model2_tree")## # A tibble: 3 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
var_importance(model2_tree, "Tree Model")
Use at least two other algorithms to predict prices. Don’t forget to tune the parameters of these algorithms. Then compare the performance of your algorithms to linear regression and trees.
# Random forest fitted through caret's "ranger" method on the same formula as
# the tree model, evaluated with 5-fold cross-validation.
set.seed(123)
model1_rft <- train(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    num_tube_lines +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  data = train_data,
  method = "ranger",
  trControl = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE
  ),
  # 4 mtry values x 4 node sizes = 16 candidate forests per fold.
  # NOTE(review): the run logged below selects mtry = 17 and
  # min.node.size = 4, both on the edge of this grid, so values just outside
  # it may be worth trying.
  tuneGrid = expand.grid(
    .mtry = 17:20,
    .splitrule = "variance",
    .min.node.size = 1:4
  ),
  # impurity-based variable importance is stored with the fitted forest
  importance = "impurity"
)
## + Fold1: mtry=17, splitrule=variance, min.node.size=1
## - Fold1: mtry=17, splitrule=variance, min.node.size=1
## + Fold1: mtry=18, splitrule=variance, min.node.size=1
## - Fold1: mtry=18, splitrule=variance, min.node.size=1
## + Fold1: mtry=19, splitrule=variance, min.node.size=1
## - Fold1: mtry=19, splitrule=variance, min.node.size=1
## + Fold1: mtry=20, splitrule=variance, min.node.size=1
## - Fold1: mtry=20, splitrule=variance, min.node.size=1
## + Fold1: mtry=17, splitrule=variance, min.node.size=2
## - Fold1: mtry=17, splitrule=variance, min.node.size=2
## + Fold1: mtry=18, splitrule=variance, min.node.size=2
## - Fold1: mtry=18, splitrule=variance, min.node.size=2
## + Fold1: mtry=19, splitrule=variance, min.node.size=2
## - Fold1: mtry=19, splitrule=variance, min.node.size=2
## + Fold1: mtry=20, splitrule=variance, min.node.size=2
## - Fold1: mtry=20, splitrule=variance, min.node.size=2
## + Fold1: mtry=17, splitrule=variance, min.node.size=3
## - Fold1: mtry=17, splitrule=variance, min.node.size=3
## + Fold1: mtry=18, splitrule=variance, min.node.size=3
## - Fold1: mtry=18, splitrule=variance, min.node.size=3
## + Fold1: mtry=19, splitrule=variance, min.node.size=3
## - Fold1: mtry=19, splitrule=variance, min.node.size=3
## + Fold1: mtry=20, splitrule=variance, min.node.size=3
## - Fold1: mtry=20, splitrule=variance, min.node.size=3
## + Fold1: mtry=17, splitrule=variance, min.node.size=4
## - Fold1: mtry=17, splitrule=variance, min.node.size=4
## + Fold1: mtry=18, splitrule=variance, min.node.size=4
## - Fold1: mtry=18, splitrule=variance, min.node.size=4
## + Fold1: mtry=19, splitrule=variance, min.node.size=4
## - Fold1: mtry=19, splitrule=variance, min.node.size=4
## + Fold1: mtry=20, splitrule=variance, min.node.size=4
## - Fold1: mtry=20, splitrule=variance, min.node.size=4
## + Fold2: mtry=17, splitrule=variance, min.node.size=1
## - Fold2: mtry=17, splitrule=variance, min.node.size=1
## + Fold2: mtry=18, splitrule=variance, min.node.size=1
## - Fold2: mtry=18, splitrule=variance, min.node.size=1
## + Fold2: mtry=19, splitrule=variance, min.node.size=1
## - Fold2: mtry=19, splitrule=variance, min.node.size=1
## + Fold2: mtry=20, splitrule=variance, min.node.size=1
## - Fold2: mtry=20, splitrule=variance, min.node.size=1
## + Fold2: mtry=17, splitrule=variance, min.node.size=2
## - Fold2: mtry=17, splitrule=variance, min.node.size=2
## + Fold2: mtry=18, splitrule=variance, min.node.size=2
## - Fold2: mtry=18, splitrule=variance, min.node.size=2
## + Fold2: mtry=19, splitrule=variance, min.node.size=2
## - Fold2: mtry=19, splitrule=variance, min.node.size=2
## + Fold2: mtry=20, splitrule=variance, min.node.size=2
## - Fold2: mtry=20, splitrule=variance, min.node.size=2
## + Fold2: mtry=17, splitrule=variance, min.node.size=3
## - Fold2: mtry=17, splitrule=variance, min.node.size=3
## + Fold2: mtry=18, splitrule=variance, min.node.size=3
## - Fold2: mtry=18, splitrule=variance, min.node.size=3
## + Fold2: mtry=19, splitrule=variance, min.node.size=3
## - Fold2: mtry=19, splitrule=variance, min.node.size=3
## + Fold2: mtry=20, splitrule=variance, min.node.size=3
## - Fold2: mtry=20, splitrule=variance, min.node.size=3
## + Fold2: mtry=17, splitrule=variance, min.node.size=4
## - Fold2: mtry=17, splitrule=variance, min.node.size=4
## + Fold2: mtry=18, splitrule=variance, min.node.size=4
## - Fold2: mtry=18, splitrule=variance, min.node.size=4
## + Fold2: mtry=19, splitrule=variance, min.node.size=4
## - Fold2: mtry=19, splitrule=variance, min.node.size=4
## + Fold2: mtry=20, splitrule=variance, min.node.size=4
## - Fold2: mtry=20, splitrule=variance, min.node.size=4
## + Fold3: mtry=17, splitrule=variance, min.node.size=1
## - Fold3: mtry=17, splitrule=variance, min.node.size=1
## + Fold3: mtry=18, splitrule=variance, min.node.size=1
## - Fold3: mtry=18, splitrule=variance, min.node.size=1
## + Fold3: mtry=19, splitrule=variance, min.node.size=1
## - Fold3: mtry=19, splitrule=variance, min.node.size=1
## + Fold3: mtry=20, splitrule=variance, min.node.size=1
## - Fold3: mtry=20, splitrule=variance, min.node.size=1
## + Fold3: mtry=17, splitrule=variance, min.node.size=2
## - Fold3: mtry=17, splitrule=variance, min.node.size=2
## + Fold3: mtry=18, splitrule=variance, min.node.size=2
## - Fold3: mtry=18, splitrule=variance, min.node.size=2
## + Fold3: mtry=19, splitrule=variance, min.node.size=2
## - Fold3: mtry=19, splitrule=variance, min.node.size=2
## + Fold3: mtry=20, splitrule=variance, min.node.size=2
## - Fold3: mtry=20, splitrule=variance, min.node.size=2
## + Fold3: mtry=17, splitrule=variance, min.node.size=3
## - Fold3: mtry=17, splitrule=variance, min.node.size=3
## + Fold3: mtry=18, splitrule=variance, min.node.size=3
## - Fold3: mtry=18, splitrule=variance, min.node.size=3
## + Fold3: mtry=19, splitrule=variance, min.node.size=3
## - Fold3: mtry=19, splitrule=variance, min.node.size=3
## + Fold3: mtry=20, splitrule=variance, min.node.size=3
## - Fold3: mtry=20, splitrule=variance, min.node.size=3
## + Fold3: mtry=17, splitrule=variance, min.node.size=4
## - Fold3: mtry=17, splitrule=variance, min.node.size=4
## + Fold3: mtry=18, splitrule=variance, min.node.size=4
## - Fold3: mtry=18, splitrule=variance, min.node.size=4
## + Fold3: mtry=19, splitrule=variance, min.node.size=4
## - Fold3: mtry=19, splitrule=variance, min.node.size=4
## + Fold3: mtry=20, splitrule=variance, min.node.size=4
## - Fold3: mtry=20, splitrule=variance, min.node.size=4
## + Fold4: mtry=17, splitrule=variance, min.node.size=1
## - Fold4: mtry=17, splitrule=variance, min.node.size=1
## + Fold4: mtry=18, splitrule=variance, min.node.size=1
## - Fold4: mtry=18, splitrule=variance, min.node.size=1
## + Fold4: mtry=19, splitrule=variance, min.node.size=1
## - Fold4: mtry=19, splitrule=variance, min.node.size=1
## + Fold4: mtry=20, splitrule=variance, min.node.size=1
## - Fold4: mtry=20, splitrule=variance, min.node.size=1
## + Fold4: mtry=17, splitrule=variance, min.node.size=2
## - Fold4: mtry=17, splitrule=variance, min.node.size=2
## + Fold4: mtry=18, splitrule=variance, min.node.size=2
## - Fold4: mtry=18, splitrule=variance, min.node.size=2
## + Fold4: mtry=19, splitrule=variance, min.node.size=2
## - Fold4: mtry=19, splitrule=variance, min.node.size=2
## + Fold4: mtry=20, splitrule=variance, min.node.size=2
## - Fold4: mtry=20, splitrule=variance, min.node.size=2
## + Fold4: mtry=17, splitrule=variance, min.node.size=3
## - Fold4: mtry=17, splitrule=variance, min.node.size=3
## + Fold4: mtry=18, splitrule=variance, min.node.size=3
## - Fold4: mtry=18, splitrule=variance, min.node.size=3
## + Fold4: mtry=19, splitrule=variance, min.node.size=3
## - Fold4: mtry=19, splitrule=variance, min.node.size=3
## + Fold4: mtry=20, splitrule=variance, min.node.size=3
## - Fold4: mtry=20, splitrule=variance, min.node.size=3
## + Fold4: mtry=17, splitrule=variance, min.node.size=4
## - Fold4: mtry=17, splitrule=variance, min.node.size=4
## + Fold4: mtry=18, splitrule=variance, min.node.size=4
## - Fold4: mtry=18, splitrule=variance, min.node.size=4
## + Fold4: mtry=19, splitrule=variance, min.node.size=4
## - Fold4: mtry=19, splitrule=variance, min.node.size=4
## + Fold4: mtry=20, splitrule=variance, min.node.size=4
## - Fold4: mtry=20, splitrule=variance, min.node.size=4
## + Fold5: mtry=17, splitrule=variance, min.node.size=1
## - Fold5: mtry=17, splitrule=variance, min.node.size=1
## + Fold5: mtry=18, splitrule=variance, min.node.size=1
## - Fold5: mtry=18, splitrule=variance, min.node.size=1
## + Fold5: mtry=19, splitrule=variance, min.node.size=1
## - Fold5: mtry=19, splitrule=variance, min.node.size=1
## + Fold5: mtry=20, splitrule=variance, min.node.size=1
## - Fold5: mtry=20, splitrule=variance, min.node.size=1
## + Fold5: mtry=17, splitrule=variance, min.node.size=2
## - Fold5: mtry=17, splitrule=variance, min.node.size=2
## + Fold5: mtry=18, splitrule=variance, min.node.size=2
## - Fold5: mtry=18, splitrule=variance, min.node.size=2
## + Fold5: mtry=19, splitrule=variance, min.node.size=2
## - Fold5: mtry=19, splitrule=variance, min.node.size=2
## + Fold5: mtry=20, splitrule=variance, min.node.size=2
## - Fold5: mtry=20, splitrule=variance, min.node.size=2
## + Fold5: mtry=17, splitrule=variance, min.node.size=3
## - Fold5: mtry=17, splitrule=variance, min.node.size=3
## + Fold5: mtry=18, splitrule=variance, min.node.size=3
## - Fold5: mtry=18, splitrule=variance, min.node.size=3
## + Fold5: mtry=19, splitrule=variance, min.node.size=3
## - Fold5: mtry=19, splitrule=variance, min.node.size=3
## + Fold5: mtry=20, splitrule=variance, min.node.size=3
## - Fold5: mtry=20, splitrule=variance, min.node.size=3
## + Fold5: mtry=17, splitrule=variance, min.node.size=4
## - Fold5: mtry=17, splitrule=variance, min.node.size=4
## + Fold5: mtry=18, splitrule=variance, min.node.size=4
## - Fold5: mtry=18, splitrule=variance, min.node.size=4
## + Fold5: mtry=19, splitrule=variance, min.node.size=4
## - Fold5: mtry=19, splitrule=variance, min.node.size=4
## + Fold5: mtry=20, splitrule=variance, min.node.size=4
## - Fold5: mtry=20, splitrule=variance, min.node.size=4
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 17, splitrule = variance, min.node.size = 4 on full training set
# maxstat performs worse than variance in this case
# model3_rft <- train(
# price ~ average_income:london_zone+
# district +
# water_company+
# property_type+
# freehold_or_leasehold+
# latitude*longitude*total_floor_area*altitude+
# average_income:number_habitable_rooms:total_floor_area+
# I(average_income^3)+
# energy_consumption_current+
# energy_consumption_potential+
# windows_energy_eff+
# co2_emissions_current:co2_emissions_potential+
# current_energy_rating+
# distance_to_station+
# num_tube_lines+
# type_of_closest_station+
# num_rail_lines+
# num_light_rail_lines
# ,
# train_data,
# method = "ranger",
# trControl = trainControl(method = "cv",
# number = 5,
# verboseIter = TRUE),
# tuneGrid = expand.grid(.mtry = c(13:15),
# .splitrule = "maxstat",
# .min.node.size = c(1:4)),
# importance = "impurity"
# )
#just to check that passing feature interactions to the tree is the better approach
# model2_rft <- train(
# price ~ .,
# na.action = na.omit,
# train_data,
# method = "ranger",
# trControl = trainControl(method = "cv",
# number = 5,
# verboseIter = TRUE),
# tuneGrid = expand.grid(.mtry = c(2:6),
# .splitrule = "variance",
# .min.node.size = c(6:8)),
# importance = "impurity"
#
# )
# model2_rft$results %>%
# arrange(RMSE)
# model3_rft$results %>%
# arrange(RMSE)
# Cross-validated results for the random forest tuning grid.
# NOTE(review): slice_max() on RMSE keeps the five rows with the HIGHEST
# error, i.e. the worst parameter combinations (see the descending RMSE in
# the output below). slice_min() would list the best ones.
model1_rft$results %>%
slice_max(order_by = RMSE, n=5)## mtry splitrule min.node.size RMSE Rsquared MAE RMSESD RsquaredSD
## 1 20 variance 4 221971.8 0.8276500 100401.3 19702.93 0.02550171
## 2 20 variance 2 221962.3 0.8276764 100570.2 18056.77 0.02465326
## 3 19 variance 2 221802.0 0.8282074 100406.4 19646.47 0.02361816
## 4 18 variance 4 221628.4 0.8291604 100273.3 19155.31 0.02240415
## 5 17 variance 3 221559.0 0.8294201 100167.0 19952.90 0.02338611
## MAESD
## 1 2746.202
## 2 2784.555
## 3 2589.030
## 4 2626.593
## 5 2604.842
# Report model1_rft in the model comparison table (RMSE and Rsquare per
# model). m_comp_table() is defined elsewhere in this file; presumably it
# appends one row per call -- TODO confirm against its definition.
m_comp_table(model1_rft, "model1_rft")## # A tibble: 4 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
## 4 model1_rft 177126. 0.868
# Plot variable importance for the random forest (helper defined elsewhere)
var_importance(model1_rft, "Random Forest Tree")
# Fix the RNG seed so the boosting run below is repeatable
set.seed(123)
library(gbm)
modelLookup("gbm")## model parameter label forReg forClass probModel
## 1 gbm n.trees # Boosting Iterations TRUE TRUE TRUE
## 2 gbm interaction.depth Max Tree Depth TRUE TRUE TRUE
## 3 gbm shrinkage Shrinkage TRUE TRUE TRUE
## 4 gbm n.minobsinnode Min. Terminal Node Size TRUE TRUE TRUE
# Gradient boosting machine fitted through caret's "gbm" method on the same
# formula as the tree and random forest models, evaluated with 5-fold
# cross-validation and selected on RMSE.
# NOTE(review): gbm prints its per-iteration deviance trace during every fold
# (see the log below); passing verbose = FALSE through train() should silence
# it when knitting -- confirm before changing.
model1_gbm <- train(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    num_tube_lines +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  data = train_data,
  method = "gbm",
  metric = "RMSE",
  trControl = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE
  ),
  # used for last tune iteration, commented out for quicker knitting performance
  # tuneGrid = expand.grid(interaction.depth = c(5,6,7),
  #                        n.trees = 1500,
  #                        shrinkage = 0.1,
  #                        n.minobsinnode = c(18:22)),
  # Single parameter combination kept for the knitted run. The trailing comma
  # that used to follow this argument was removed: it handed train() an empty
  # extra argument.
  tuneGrid = expand.grid(
    interaction.depth = 6,
    n.trees = 1500,
    shrinkage = 0.1,
    n.minobsinnode = 20
  )
)
## + Fold1: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 245770942120.5868 nan 0.1000 27913796306.0575
## 2 220496033378.4749 nan 0.1000 23587396757.6876
## 3 198966755141.3963 nan 0.1000 22184458610.1006
## 4 180733569550.6752 nan 0.1000 17862247312.6087
## 5 163942965707.7488 nan 0.1000 17301614166.0010
## 6 151235454879.0226 nan 0.1000 12142948585.8603
## 7 138643471157.6980 nan 0.1000 10841245188.8486
## 8 129122314103.8701 nan 0.1000 9399578778.2967
## 9 120752834748.3752 nan 0.1000 7222045676.7308
## 10 112878933209.0992 nan 0.1000 6755349429.3113
## 20 72081873171.3212 nan 0.1000 1562369746.8483
## 40 49970978523.8838 nan 0.1000 23261474.0351
## 60 43178118596.2501 nan 0.1000 296932718.6116
## 80 39275074130.5166 nan 0.1000 -116738323.3194
## 100 35507742615.8260 nan 0.1000 122946900.2753
## 120 33357661438.1293 nan 0.1000 -125216495.8915
## 140 31358277183.1927 nan 0.1000 -83498124.5453
## 160 29146002839.1633 nan 0.1000 -49649554.8310
## 180 27640466817.2677 nan 0.1000 -138133593.8321
## 200 26163541653.2510 nan 0.1000 -99989733.7549
## 220 24807547896.8489 nan 0.1000 -148287769.2944
## 240 23604761820.4492 nan 0.1000 -43472201.2617
## 260 22721712099.2540 nan 0.1000 -91696125.4864
## 280 21853202761.9176 nan 0.1000 -128438495.5682
## 300 21105412840.2740 nan 0.1000 -64732405.1891
## 320 20369975943.3323 nan 0.1000 -49377872.7928
## 340 19585934258.2707 nan 0.1000 -26781695.8645
## 360 18998864693.2778 nan 0.1000 -16917984.7555
## 380 18378400579.6017 nan 0.1000 -20629246.3528
## 400 17765429272.0002 nan 0.1000 -65044820.7843
## 420 17231399804.9744 nan 0.1000 -504347.1026
## 440 16642349718.8895 nan 0.1000 -49166473.3428
## 460 16141121482.4456 nan 0.1000 -52221263.1869
## 480 15707143417.1551 nan 0.1000 -39066904.5090
## 500 15254781819.4885 nan 0.1000 -64173396.4893
## 520 14862546368.3428 nan 0.1000 -8809408.0675
## 540 14494824390.1476 nan 0.1000 -8973264.3410
## 560 14075317400.3562 nan 0.1000 -46477372.9682
## 580 13697668515.5735 nan 0.1000 -47006651.6873
## 600 13374464779.0017 nan 0.1000 -16145803.5921
## 620 13111262006.6882 nan 0.1000 -34403483.1437
## 640 12824220639.4832 nan 0.1000 -35040854.5752
## 660 12524804626.7020 nan 0.1000 -26928838.9049
## 680 12216233039.4002 nan 0.1000 -20906999.5960
## 700 11911728946.7618 nan 0.1000 -36074481.0968
## 720 11710161797.0626 nan 0.1000 -25591743.9733
## 740 11492594880.7312 nan 0.1000 -4051902.2704
## 760 11286506519.1820 nan 0.1000 -8726641.5409
## 780 11063408677.1540 nan 0.1000 -17489186.2041
## 800 10844352435.1395 nan 0.1000 -9300993.3952
## 820 10636432935.0533 nan 0.1000 -15293312.7636
## 840 10454097097.5970 nan 0.1000 -7268132.3968
## 860 10246851140.4589 nan 0.1000 -21105804.2472
## 880 10074222250.6038 nan 0.1000 -21762233.8160
## 900 9884467049.5007 nan 0.1000 -5876451.0479
## 920 9686288039.0172 nan 0.1000 -25972137.5484
## 940 9522499908.1136 nan 0.1000 -13310390.7206
## 960 9360211681.2470 nan 0.1000 -6567911.5318
## 980 9227738221.0513 nan 0.1000 -11332123.0432
## 1000 9094978735.0635 nan 0.1000 -8708091.1827
## 1020 8946712087.1707 nan 0.1000 -4469403.4344
## 1040 8821845251.7227 nan 0.1000 -22197304.1709
## 1060 8661356368.0037 nan 0.1000 -3572213.4257
## 1080 8526704466.7052 nan 0.1000 -9952376.1314
## 1100 8384468875.2876 nan 0.1000 -18328275.5986
## 1120 8265343553.4545 nan 0.1000 -12558735.9572
## 1140 8132146263.3720 nan 0.1000 -18028961.5244
## 1160 8008120378.2614 nan 0.1000 -12830345.1944
## 1180 7875030468.5083 nan 0.1000 -11566487.5912
## 1200 7765530522.2643 nan 0.1000 -13309168.7427
## 1220 7645411193.3166 nan 0.1000 -10038084.4163
## 1240 7547856646.9370 nan 0.1000 -6031852.6090
## 1260 7438506456.9779 nan 0.1000 -11980314.2594
## 1280 7336253865.5108 nan 0.1000 -14460199.2270
## 1300 7248670294.9782 nan 0.1000 -2795811.3005
## 1320 7148261198.8695 nan 0.1000 -9099666.9398
## 1340 7053707381.4279 nan 0.1000 -5239247.0050
## 1360 6977759084.4658 nan 0.1000 -8119083.6796
## 1380 6886956994.1580 nan 0.1000 -10561425.7479
## 1400 6776129125.5583 nan 0.1000 -14752251.6349
## 1420 6699363114.6455 nan 0.1000 -7982151.9215
## 1440 6591587374.1673 nan 0.1000 -2436253.8460
## 1460 6505841436.1119 nan 0.1000 -10070641.0214
## 1480 6432537884.0524 nan 0.1000 -11327314.1636
## 1500 6349248105.8138 nan 0.1000 -175780.2588
##
## - Fold1: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## + Fold2: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 252857429558.2422 nan 0.1000 32735688858.4627
## 2 229468584328.0307 nan 0.1000 25580922115.4166
## 3 206098392458.1672 nan 0.1000 22169394817.5772
## 4 185897050671.4452 nan 0.1000 19937146918.7795
## 5 170279506581.7034 nan 0.1000 16959883767.7377
## 6 154775454690.6626 nan 0.1000 14150144404.8932
## 7 142508207849.8500 nan 0.1000 10047940604.3013
## 8 131496770470.9434 nan 0.1000 11054701881.1596
## 9 122147330639.3122 nan 0.1000 7993155351.5058
## 10 116147916969.7818 nan 0.1000 6437165111.1032
## 20 72345352376.6985 nan 0.1000 2024729729.4361
## 40 50640252350.1066 nan 0.1000 57604195.4140
## 60 43918485524.8905 nan 0.1000 96323082.5591
## 80 39418974643.8000 nan 0.1000 -87220126.2860
## 100 35843144497.9552 nan 0.1000 -42343101.6296
## 120 32938200097.8308 nan 0.1000 -124586755.7857
## 140 30909438302.4859 nan 0.1000 -127143813.6372
## 160 28881196704.1372 nan 0.1000 -94107300.6494
## 180 27582750241.9574 nan 0.1000 -69537911.1318
## 200 26221437972.5321 nan 0.1000 -159667125.5304
## 220 24852755080.5046 nan 0.1000 -159953412.6353
## 240 23801402471.4164 nan 0.1000 -83854284.7447
## 260 22805624133.8707 nan 0.1000 -23931013.6278
## 280 21816216491.4235 nan 0.1000 -31306504.4787
## 300 20945885843.4627 nan 0.1000 -110988187.9888
## 320 20180383875.7523 nan 0.1000 -65459971.1709
## 340 19353681940.2197 nan 0.1000 -92577956.1950
## 360 18801390960.5779 nan 0.1000 -69006069.8994
## 380 18102866762.8716 nan 0.1000 -58363557.1049
## 400 17588758737.1908 nan 0.1000 -2999271.0955
## 420 16976762110.0322 nan 0.1000 -64083928.9137
## 440 16385251372.0562 nan 0.1000 -10567943.2043
## 460 15949015378.0741 nan 0.1000 -67989512.5261
## 480 15505894691.1549 nan 0.1000 -43477108.9306
## 500 15019577855.9381 nan 0.1000 -27194234.3051
## 520 14659528320.6875 nan 0.1000 -15104760.2202
## 540 14235884749.7819 nan 0.1000 -59130766.6192
## 560 13886643972.1778 nan 0.1000 -44438927.2386
## 580 13557812215.3412 nan 0.1000 -26523762.6144
## 600 13223419904.0430 nan 0.1000 -56679661.3885
## 620 12899294046.7376 nan 0.1000 -25064981.9112
## 640 12628214905.8742 nan 0.1000 -45651158.9810
## 660 12344650406.4795 nan 0.1000 -27809545.3596
## 680 12129844134.8853 nan 0.1000 -28815051.4667
## 700 11895466286.2311 nan 0.1000 -21161292.7828
## 720 11597235679.2647 nan 0.1000 -8044195.0805
## 740 11355972838.0057 nan 0.1000 -27882861.4616
## 760 11152824144.5676 nan 0.1000 -24390091.9521
## 780 10949139404.6353 nan 0.1000 -21384522.6828
## 800 10716121737.7847 nan 0.1000 -8101743.2226
## 820 10507853228.1943 nan 0.1000 -27103143.1548
## 840 10289424270.4454 nan 0.1000 -9849867.5129
## 860 10050413876.4460 nan 0.1000 5715191.2664
## 880 9876309201.4917 nan 0.1000 -14786651.4663
## 900 9676723997.4276 nan 0.1000 -10007361.3952
## 920 9527582153.6123 nan 0.1000 -22102277.9494
## 940 9379129117.5144 nan 0.1000 -19425456.2314
## 960 9225525327.3449 nan 0.1000 -4423472.2728
## 980 9076794993.1452 nan 0.1000 -22174810.2043
## 1000 8914687821.7676 nan 0.1000 -10784980.0297
## 1020 8779558179.3035 nan 0.1000 -20667559.0360
## 1040 8628320680.5105 nan 0.1000 -10130314.5078
## 1060 8480480861.8154 nan 0.1000 -12174810.1463
## 1080 8337627155.0999 nan 0.1000 -17681711.8493
## 1100 8229116928.7854 nan 0.1000 -11937210.0024
## 1120 8083438059.4408 nan 0.1000 -6483299.4968
## 1140 7952954109.7680 nan 0.1000 -9999701.9062
## 1160 7824012826.8584 nan 0.1000 -12374564.7432
## 1180 7709662991.2529 nan 0.1000 -12915983.0760
## 1200 7596739021.1896 nan 0.1000 -8360172.4120
## 1220 7498089028.7050 nan 0.1000 -9773183.7165
## 1240 7370815105.7835 nan 0.1000 -13182683.8033
## 1260 7253428603.3197 nan 0.1000 -9367658.7112
## 1280 7163705376.8199 nan 0.1000 -11428093.9205
## 1300 7061921352.1590 nan 0.1000 -8996439.5058
## 1320 6968155337.4887 nan 0.1000 -13543645.0455
## 1340 6865769111.1607 nan 0.1000 -9104220.3707
## 1360 6786229424.5949 nan 0.1000 -7072291.1963
## 1380 6693682790.6126 nan 0.1000 -11976274.3994
## 1400 6605499300.6163 nan 0.1000 -9457292.5857
## 1420 6525854641.5088 nan 0.1000 -12928030.0047
## 1440 6437667789.3327 nan 0.1000 -6851750.4677
## 1460 6348165334.0085 nan 0.1000 -8934079.5662
## 1480 6284704555.9506 nan 0.1000 -4465894.7212
## 1500 6191164313.2527 nan 0.1000 -11899523.6536
##
## - Fold2: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## + Fold3: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 235608621507.4449 nan 0.1000 30879376622.0596
## 2 208734544162.3782 nan 0.1000 23429114145.1144
## 3 187002454880.4561 nan 0.1000 18713664652.4040
## 4 168593826613.0860 nan 0.1000 18553628644.7405
## 5 153119205052.7050 nan 0.1000 13860784192.3468
## 6 140667420309.0081 nan 0.1000 10431261374.8487
## 7 130310115065.9453 nan 0.1000 9566396123.9262
## 8 120680600993.8784 nan 0.1000 8403928031.9462
## 9 113100764300.2584 nan 0.1000 7482748979.1155
## 10 105584699404.8595 nan 0.1000 6439948843.5637
## 20 65304038550.5719 nan 0.1000 2264420076.8587
## 40 45426423371.5475 nan 0.1000 103639880.3076
## 60 39539452915.8119 nan 0.1000 -178141731.7727
## 80 35641149382.9817 nan 0.1000 -110273663.9535
## 100 33039246560.3768 nan 0.1000 -223186652.3513
## 120 31015811543.1256 nan 0.1000 23229248.4652
## 140 29255934961.2150 nan 0.1000 -44005952.6664
## 160 27929602487.9932 nan 0.1000 -116056115.9250
## 180 26221187918.5628 nan 0.1000 -138551156.9945
## 200 24813167557.2311 nan 0.1000 -121224328.2081
## 220 23698710774.5351 nan 0.1000 -181190141.0021
## 240 22796516940.4732 nan 0.1000 -100163927.1998
## 260 21836735031.2765 nan 0.1000 -22284261.3563
## 280 20940644111.0213 nan 0.1000 -114239380.8621
## 300 20222711382.0694 nan 0.1000 -25092641.5378
## 320 19558602618.3870 nan 0.1000 -11102116.4520
## 340 18899481429.8760 nan 0.1000 -42975977.0068
## 360 18318898753.8556 nan 0.1000 -46283423.9858
## 380 17593644170.8995 nan 0.1000 -82194935.6588
## 400 17070389437.7588 nan 0.1000 -100905930.8390
## 420 16475620408.9516 nan 0.1000 -26417881.8018
## 440 16072810550.0685 nan 0.1000 -63183971.0065
## 460 15665893177.1986 nan 0.1000 -57536129.9529
## 480 15121523005.3598 nan 0.1000 -60597733.4384
## 500 14718953176.8690 nan 0.1000 -18917685.8216
## 520 14348613213.9491 nan 0.1000 -22795656.5181
## 540 14001815499.7583 nan 0.1000 -65812352.8997
## 560 13689887690.6387 nan 0.1000 -34023449.0318
## 580 13417133061.3824 nan 0.1000 -23938059.7949
## 600 13099378720.6788 nan 0.1000 -36176574.3557
## 620 12766254103.6985 nan 0.1000 -16861317.5598
## 640 12481921295.0686 nan 0.1000 -15042500.4106
## 660 12215530252.7210 nan 0.1000 -30591546.1112
## 680 11991277847.5828 nan 0.1000 -29115148.3042
## 700 11777399131.4786 nan 0.1000 -13733274.4323
## 720 11504236239.2807 nan 0.1000 -28599172.9043
## 740 11246621345.7191 nan 0.1000 -31590828.1279
## 760 11077986344.4351 nan 0.1000 -20915175.1246
## 780 10863902263.8366 nan 0.1000 -32359160.5377
## 800 10641729262.3869 nan 0.1000 -8774988.8009
## 820 10454166933.3583 nan 0.1000 -9980628.0766
## 840 10250072314.4415 nan 0.1000 -13107074.3648
## 860 10055983305.6458 nan 0.1000 -17721136.7290
## 880 9869314177.0026 nan 0.1000 -18960059.6304
## 900 9702313748.1725 nan 0.1000 -13790752.0034
## 920 9535127946.6023 nan 0.1000 -17530541.1853
## 940 9393446267.0614 nan 0.1000 -10945632.4799
## 960 9245539909.0083 nan 0.1000 -1632329.0114
## 980 9051905752.5273 nan 0.1000 -7643159.3925
## 1000 8900380221.9970 nan 0.1000 -17264218.2269
## 1020 8739507053.8316 nan 0.1000 -11262136.3240
## 1040 8602726838.7925 nan 0.1000 -6100143.1095
## 1060 8448577662.9514 nan 0.1000 -12803325.5422
## 1080 8325955447.7055 nan 0.1000 -5180944.9995
## 1100 8194565490.0883 nan 0.1000 -4826225.4902
## 1120 8064295122.2238 nan 0.1000 -6965175.6429
## 1140 7952050720.0703 nan 0.1000 -6318720.9564
## 1160 7860911899.1258 nan 0.1000 -18210746.8427
## 1180 7726068075.7169 nan 0.1000 -9008557.5669
## 1200 7598789329.7159 nan 0.1000 -12707163.6990
## 1220 7478159390.8327 nan 0.1000 -13821965.8951
## 1240 7370945981.1519 nan 0.1000 -10591491.5532
## 1260 7287146782.6365 nan 0.1000 -19667740.9926
## 1280 7182076632.0399 nan 0.1000 -8172292.3736
## 1300 7067291602.8504 nan 0.1000 -6676594.0747
## 1320 6948372843.3820 nan 0.1000 -4943020.6442
## 1340 6859579217.0818 nan 0.1000 -12663843.5063
## 1360 6763674225.4558 nan 0.1000 -4870933.7955
## 1380 6680331082.7265 nan 0.1000 -10140287.7247
## 1400 6573388978.4136 nan 0.1000 -11518827.8977
## 1420 6477176010.2009 nan 0.1000 -7419540.2017
## 1440 6391157899.5416 nan 0.1000 -8000024.2128
## 1460 6316135324.8865 nan 0.1000 -6815409.1134
## 1480 6238831742.5300 nan 0.1000 -6027450.4772
## 1500 6148680037.0266 nan 0.1000 -4622552.8879
##
## - Fold3: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## + Fold4: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 264257308377.5061 nan 0.1000 31872576935.6513
## 2 237038676812.4272 nan 0.1000 27721599868.8883
## 3 213063332917.4612 nan 0.1000 22705814586.8588
## 4 194074469286.7059 nan 0.1000 21496704599.1911
## 5 176340931711.8455 nan 0.1000 17258622875.9309
## 6 161352500095.3738 nan 0.1000 12817200118.5636
## 7 148939917148.8544 nan 0.1000 12636063294.5769
## 8 137905476261.8008 nan 0.1000 10967766599.5360
## 9 128162197952.5076 nan 0.1000 9354426600.4824
## 10 120558429370.1805 nan 0.1000 6535131463.4070
## 20 73553338418.2213 nan 0.1000 2021195526.9235
## 40 50951300899.9541 nan 0.1000 -117480712.1652
## 60 44333187133.5706 nan 0.1000 -239814055.1559
## 80 40098470609.2127 nan 0.1000 -223116231.6656
## 100 36790328211.1718 nan 0.1000 -201357393.1526
## 120 34291068285.3162 nan 0.1000 -214224529.4806
## 140 31739024869.2677 nan 0.1000 40870265.7196
## 160 29833008125.4851 nan 0.1000 -118753820.3395
## 180 28293986668.0983 nan 0.1000 -75579320.9921
## 200 26913949859.0155 nan 0.1000 -58395361.2838
## 220 25651243277.0665 nan 0.1000 -103580851.5644
## 240 24468036918.9617 nan 0.1000 26108538.4694
## 260 23592772315.5870 nan 0.1000 -153655338.9215
## 280 22723973212.6390 nan 0.1000 -32958476.5709
## 300 21907472023.9955 nan 0.1000 -71131557.4749
## 320 20997468280.1166 nan 0.1000 -75249176.1871
## 340 20145334288.0003 nan 0.1000 -72642675.8641
## 360 19559873196.3977 nan 0.1000 -60005757.8685
## 380 18908333436.4412 nan 0.1000 -76749890.2338
## 400 18332340378.7389 nan 0.1000 -83196145.5416
## 420 17805140512.6957 nan 0.1000 -53913391.1619
## 440 17357076876.0255 nan 0.1000 -100401562.8714
## 460 16902693938.4186 nan 0.1000 12781816.8015
## 480 16412630497.3127 nan 0.1000 -54386225.0617
## 500 15939880964.5230 nan 0.1000 -50276539.8653
## 520 15565815612.6316 nan 0.1000 -35453676.0309
## 540 15181205059.3327 nan 0.1000 -13266975.0613
## 560 14751774199.5894 nan 0.1000 -28375353.1755
## 580 14427574760.7005 nan 0.1000 -18890548.8383
## 600 14090199804.4023 nan 0.1000 -23498995.1007
## 620 13743526557.0678 nan 0.1000 -17873084.6156
## 640 13433628455.7297 nan 0.1000 -19771490.0120
## 660 13129514068.4503 nan 0.1000 -24531263.4997
## 680 12791159580.2464 nan 0.1000 -29846092.1625
## 700 12518406429.1571 nan 0.1000 -30630128.6148
## 720 12241447748.6456 nan 0.1000 -12487810.8429
## 740 12009017536.4799 nan 0.1000 -13471279.7996
## 760 11776067714.8529 nan 0.1000 -16873946.9805
## 780 11567757442.5193 nan 0.1000 -8197657.1976
## 800 11370992273.0722 nan 0.1000 -33400837.5155
## 820 11161835385.0842 nan 0.1000 -12169451.1019
## 840 10912474097.5016 nan 0.1000 -9449111.6072
## 860 10738083613.3072 nan 0.1000 -39743420.4008
## 880 10538215438.9139 nan 0.1000 -25655047.8908
## 900 10336542043.5726 nan 0.1000 -31229323.5323
## 920 10184292362.2181 nan 0.1000 -22295240.5969
## 940 10017153113.5308 nan 0.1000 -6608568.9243
## 960 9839834165.7336 nan 0.1000 -2624477.0066
## 980 9658618291.5527 nan 0.1000 -15036060.1448
## 1000 9507367388.4163 nan 0.1000 -11759917.3031
## 1020 9331173177.7255 nan 0.1000 -23813399.0917
## 1040 9188305053.0633 nan 0.1000 -27659352.8326
## 1060 9041969862.6090 nan 0.1000 -12382082.8732
## 1080 8923866014.3171 nan 0.1000 -21536670.3432
## 1100 8777449617.6638 nan 0.1000 -15440742.9321
## 1120 8642507647.2059 nan 0.1000 -12114239.8911
## 1140 8512927902.1575 nan 0.1000 -22252863.9092
## 1160 8355611174.5061 nan 0.1000 -13282951.4955
## 1180 8233589192.0746 nan 0.1000 -15856347.6160
## 1200 8104551083.0837 nan 0.1000 -8976577.1551
## 1220 7989440625.7464 nan 0.1000 -19906069.1559
## 1240 7852964707.8869 nan 0.1000 -9403029.0057
## 1260 7758040497.2032 nan 0.1000 -15020750.3617
## 1280 7650749099.7584 nan 0.1000 -7681261.2228
## 1300 7534546121.7128 nan 0.1000 -12378075.2040
## 1320 7428296015.9009 nan 0.1000 -9098032.3603
## 1340 7309810650.7571 nan 0.1000 -10680339.3683
## 1360 7203500588.1479 nan 0.1000 -4974080.6584
## 1380 7111802950.2971 nan 0.1000 -5790322.2306
## 1400 7014100664.6406 nan 0.1000 -9300435.2063
## 1420 6929011743.8843 nan 0.1000 -6015853.9934
## 1440 6830231478.8042 nan 0.1000 -6368149.0966
## 1460 6740709964.3529 nan 0.1000 -49966.6986
## 1480 6653070589.1327 nan 0.1000 -12728554.9439
## 1500 6548761206.1699 nan 0.1000 -10224355.0385
##
## - Fold4: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## + Fold5: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 249441323323.9969 nan 0.1000 32369174459.4137
## 2 224427238406.4524 nan 0.1000 24749932265.4135
## 3 202677715245.2307 nan 0.1000 23402960502.8531
## 4 184370331108.8295 nan 0.1000 17045401888.6369
## 5 167216536238.6490 nan 0.1000 14539973880.7579
## 6 153763856852.0132 nan 0.1000 12216982008.6676
## 7 141060975362.9128 nan 0.1000 13399373809.4912
## 8 131273436579.4751 nan 0.1000 8650179026.8062
## 9 122367101486.8148 nan 0.1000 7631805779.6312
## 10 114088146868.5761 nan 0.1000 7908673200.5407
## 20 69662044061.0137 nan 0.1000 2008703057.5094
## 40 49780453984.5049 nan 0.1000 27037710.8472
## 60 43629476019.6020 nan 0.1000 -55858272.4516
## 80 39221627424.1424 nan 0.1000 -93157256.5946
## 100 36201365582.2595 nan 0.1000 -170753719.3974
## 120 32865491476.7607 nan 0.1000 -190883905.5719
## 140 30133951261.6455 nan 0.1000 -183402444.6008
## 160 28153185089.0373 nan 0.1000 -121045204.6111
## 180 26515650222.0951 nan 0.1000 -57723929.5205
## 200 25357821193.4606 nan 0.1000 -35264445.6457
## 220 24034582761.8806 nan 0.1000 -84087409.4715
## 240 23099340556.4806 nan 0.1000 -62729524.4851
## 260 22211672675.8131 nan 0.1000 -60161303.3463
## 280 21232851001.6745 nan 0.1000 -105208424.5766
## 300 20454607825.0351 nan 0.1000 -48438891.5848
## 320 19871466249.7231 nan 0.1000 -42996077.2266
## 340 19119170857.9422 nan 0.1000 -46157995.9700
## 360 18516907768.9510 nan 0.1000 -11782309.7321
## 380 17837596689.0585 nan 0.1000 -41305925.4459
## 400 17200501663.5055 nan 0.1000 -29278895.4012
## 420 16634048488.9613 nan 0.1000 -75926348.4877
## 440 16252149441.4304 nan 0.1000 -28278893.5770
## 460 15839193049.5887 nan 0.1000 -41853254.0253
## 480 15433356699.8308 nan 0.1000 -43633437.2014
## 500 15024129576.3568 nan 0.1000 -33581970.2912
## 520 14553702017.7483 nan 0.1000 -22594284.4125
## 540 14179210201.0600 nan 0.1000 -20300266.6111
## 560 13851800045.5613 nan 0.1000 -11951556.9300
## 580 13555715273.1339 nan 0.1000 -54419550.9535
## 600 13241525533.2185 nan 0.1000 -8357695.8901
## 620 12883479201.0035 nan 0.1000 -61845310.0063
## 640 12558943960.5053 nan 0.1000 1780442.7596
## 660 12309115868.3898 nan 0.1000 -61932304.3848
## 680 11992381986.2955 nan 0.1000 -19658864.8769
## 700 11772480687.7340 nan 0.1000 -35327129.9466
## 720 11497667638.8400 nan 0.1000 -17387194.7824
## 740 11290174087.3763 nan 0.1000 -14210024.8585
## 760 11093387541.5122 nan 0.1000 -27087517.9950
## 780 10873207381.1095 nan 0.1000 -8523363.4424
## 800 10685024026.8116 nan 0.1000 -29769203.0009
## 820 10465672475.7820 nan 0.1000 -40174794.6871
## 840 10245832212.9343 nan 0.1000 -9648513.7614
## 860 10072937696.9654 nan 0.1000 -1052200.3885
## 880 9890443589.5718 nan 0.1000 -15018015.0021
## 900 9696857086.1785 nan 0.1000 -17196401.6831
## 920 9549374682.5350 nan 0.1000 -15147920.3493
## 940 9397514963.6224 nan 0.1000 -24704978.7893
## 960 9223552219.7737 nan 0.1000 -5923002.1492
## 980 9070277049.0842 nan 0.1000 -21481065.8755
## 1000 8931545973.1173 nan 0.1000 -5455972.7734
## 1020 8803179110.8739 nan 0.1000 -8487448.5506
## 1040 8663356019.2563 nan 0.1000 -6792095.6209
## 1060 8523115978.5429 nan 0.1000 -7661792.3292
## 1080 8396255874.4457 nan 0.1000 -14686291.8768
## 1100 8268326389.8093 nan 0.1000 -9969157.7524
## 1120 8143947315.5580 nan 0.1000 -11783097.2087
## 1140 8022043777.0868 nan 0.1000 -1816560.6272
## 1160 7910821765.6046 nan 0.1000 -1510231.0327
## 1180 7811939360.0336 nan 0.1000 -14918701.4743
## 1200 7718982371.8989 nan 0.1000 -12224451.7905
## 1220 7613253779.2366 nan 0.1000 -22894327.0832
## 1240 7516673250.6532 nan 0.1000 -9902650.7211
## 1260 7412813933.5619 nan 0.1000 -4702296.3277
## 1280 7311342938.8590 nan 0.1000 -10923970.5535
## 1300 7217697968.9592 nan 0.1000 -11600056.2409
## 1320 7122151818.6753 nan 0.1000 -6302328.5776
## 1340 7034220047.3386 nan 0.1000 -10337873.5333
## 1360 6936881286.0382 nan 0.1000 -7777454.2004
## 1380 6844255093.1500 nan 0.1000 -13976939.1926
## 1400 6742174962.3367 nan 0.1000 -4828884.8858
## 1420 6665487958.2994 nan 0.1000 -10107724.2242
## 1440 6573341655.0509 nan 0.1000 -9444701.9299
## 1460 6492153622.5679 nan 0.1000 -7063274.9875
## 1480 6417918976.5269 nan 0.1000 -9760365.9571
## 1500 6335937449.3803 nan 0.1000 -9811122.1510
##
## - Fold5: interaction.depth=6, n.trees=1500, shrinkage=0.1, n.minobsinnode=20
## Aggregating results
## Fitting final model on full training set
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 247937122803.3073 nan 0.1000 26928067941.0591
## 2 219482594573.7664 nan 0.1000 27696622926.5148
## 3 195998257603.3346 nan 0.1000 23799393113.4132
## 4 177300494020.4514 nan 0.1000 17649408944.1686
## 5 161631238335.1981 nan 0.1000 13676839887.1219
## 6 148645644791.4763 nan 0.1000 12483344018.5512
## 7 137888482408.7561 nan 0.1000 10449314249.2700
## 8 126564001540.9880 nan 0.1000 9664265188.4969
## 9 117759011844.5347 nan 0.1000 5968089301.7268
## 10 109823927187.9379 nan 0.1000 6970287103.3417
## 20 68104279738.4894 nan 0.1000 2311029258.7518
## 40 48143467868.8915 nan 0.1000 242456098.4519
## 60 42489262811.1228 nan 0.1000 -154802380.5658
## 80 38976540092.8548 nan 0.1000 18800498.1561
## 100 35924321722.7282 nan 0.1000 -106299511.8746
## 120 33757398612.1761 nan 0.1000 -125186384.9442
## 140 32060949780.4728 nan 0.1000 -202889412.6286
## 160 30556040965.4828 nan 0.1000 -197114303.6791
## 180 28572862329.2481 nan 0.1000 -152404982.9486
## 200 27272079842.2048 nan 0.1000 -124698753.8574
## 220 26234192901.0824 nan 0.1000 -104136724.3309
## 240 25154084280.4531 nan 0.1000 -86389905.8774
## 260 24130733029.7628 nan 0.1000 -25095759.6635
## 280 23202103431.4035 nan 0.1000 570392.2077
## 300 22367690693.0621 nan 0.1000 -55867865.2463
## 320 21510931887.8484 nan 0.1000 -122321536.1151
## 340 20729146033.1694 nan 0.1000 -17248167.7478
## 360 20066590130.0631 nan 0.1000 -81032373.9547
## 380 19304709784.0162 nan 0.1000 -95727481.6273
## 400 18740835482.0727 nan 0.1000 8994204.3449
## 420 18188574747.1229 nan 0.1000 -102540356.1008
## 440 17741716254.3114 nan 0.1000 -29860281.3965
## 460 17275099445.8511 nan 0.1000 -66571431.1910
## 480 16735310763.1297 nan 0.1000 -97839087.2427
## 500 16263510790.7657 nan 0.1000 -27191431.6754
## 520 15834131958.9991 nan 0.1000 -39158505.9847
## 540 15483574014.0491 nan 0.1000 -6724723.0537
## 560 15105076881.7728 nan 0.1000 -27932445.0611
## 580 14770847621.7911 nan 0.1000 -46330635.6817
## 600 14503280751.9179 nan 0.1000 -23491124.1331
## 620 14102199258.8450 nan 0.1000 -12411031.7476
## 640 13779739847.2408 nan 0.1000 -19882204.7525
## 660 13529666835.9760 nan 0.1000 -16646056.4063
## 680 13270400197.9259 nan 0.1000 -28244881.5618
## 700 12971761529.8940 nan 0.1000 -17761015.0122
## 720 12734025817.2245 nan 0.1000 -36266928.1544
## 740 12512891657.1334 nan 0.1000 -21915064.4381
## 760 12294928967.4861 nan 0.1000 -9114120.4217
## 780 12025479033.7098 nan 0.1000 -18852821.3759
## 800 11809556005.1355 nan 0.1000 -10863541.2604
## 820 11623781116.0073 nan 0.1000 -16301648.9411
## 840 11427894003.0327 nan 0.1000 -16794969.8801
## 860 11231482843.4158 nan 0.1000 -27776844.6950
## 880 11058893786.5036 nan 0.1000 -27436434.9310
## 900 10857377454.1935 nan 0.1000 -24779854.7500
## 920 10706805502.8016 nan 0.1000 -20374356.0200
## 940 10506098372.5636 nan 0.1000 -23792589.2379
## 960 10341111999.7245 nan 0.1000 -17113824.5931
## 980 10168696452.1936 nan 0.1000 -10009557.0688
## 1000 10008818347.7638 nan 0.1000 -10003210.1955
## 1020 9841784109.1772 nan 0.1000 -8785841.8133
## 1040 9716333817.8518 nan 0.1000 -15286040.8195
## 1060 9593562402.3732 nan 0.1000 -7322249.6581
## 1080 9442076853.1895 nan 0.1000 -20436213.3146
## 1100 9309906837.4085 nan 0.1000 588541.8727
## 1120 9170966605.3133 nan 0.1000 -14798839.9980
## 1140 9029595701.2382 nan 0.1000 -18785149.4204
## 1160 8901438973.3045 nan 0.1000 -10944135.7291
## 1180 8755598752.9136 nan 0.1000 -21521190.2713
## 1200 8654587170.7804 nan 0.1000 -10447951.1957
## 1220 8538957431.9643 nan 0.1000 -15117161.7343
## 1240 8423892240.9801 nan 0.1000 -581983.7039
## 1260 8316675132.9357 nan 0.1000 -5911425.7716
## 1280 8209476855.9256 nan 0.1000 -12313428.7181
## 1300 8089831574.7160 nan 0.1000 -5401613.1255
## 1320 7998079864.8987 nan 0.1000 -8536256.1158
## 1340 7896964922.6537 nan 0.1000 -9382989.5091
## 1360 7808128689.7541 nan 0.1000 -8653882.2216
## 1380 7711148334.3800 nan 0.1000 -13114018.4003
## 1400 7599635890.8823 nan 0.1000 -6105314.3701
## 1420 7513862093.7455 nan 0.1000 -3908287.3523
## 1440 7445191344.4467 nan 0.1000 -5429446.3583
## 1460 7356827891.3060 nan 0.1000 -3739604.7979
## 1480 7258767601.2715 nan 0.1000 -5558545.7926
## 1500 7177150070.5832 nan 0.1000 -10896921.6972
# Print up to five rows of the GBM resampling summary, ordered by largest
# cross-validated RMSE. Only one tuning configuration was evaluated here, so a
# single row is returned.
slice_max(model1_gbm$results, order_by = RMSE, n = 5)
## interaction.depth n.trees shrinkage n.minobsinnode RMSE Rsquared
## 1 6 1500 0.1 20 212981.1 0.8373203
## MAE RMSESD RsquaredSD MAESD
## 1 99441.04 15961.81 0.02536064 1765.466
# Register the GBM under the label "model1_gbm" and print the running model
# comparison table (helper defined outside this chunk).
# NOTE(review): the RMSE shown in the table differs from the cross-validated
# RMSE printed just above, so the helper presumably scores on other data
# (e.g. a hold-out set) -- confirm against its definition.
m_comp_table(model1_gbm, "model1_gbm")
## # A tibble: 4 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
## 4 model1_gbm 184484. 0.859
# Variable-importance display for the GBM (helper defined outside this chunk).
var_importance(model1_gbm, "Gradient Boosting Machine")

# Fix the RNG state so the cross-validation fold assignment is reproducible.
set.seed(123)

# k-nearest-neighbours regression on the same feature set as the other models.
# Predictors are centred and scaled before distances are computed;
# tuneLength = 10 evaluates ten candidate values of k (5, 7, ..., 23 in the
# log below) under 5-fold cross-validation.
model1_knn <- train(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    num_tube_lines +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  train_data,
  method = "knn",
  trControl = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE
  ),
  preProcess = c("center", "scale"),
  tuneLength = 10
)
## + Fold1: k= 5
## - Fold1: k= 5
## + Fold1: k= 7
## - Fold1: k= 7
## + Fold1: k= 9
## - Fold1: k= 9
## + Fold1: k=11
## - Fold1: k=11
## + Fold1: k=13
## - Fold1: k=13
## + Fold1: k=15
## - Fold1: k=15
## + Fold1: k=17
## - Fold1: k=17
## + Fold1: k=19
## - Fold1: k=19
## + Fold1: k=21
## - Fold1: k=21
## + Fold1: k=23
## - Fold1: k=23
## + Fold2: k= 5
## - Fold2: k= 5
## + Fold2: k= 7
## - Fold2: k= 7
## + Fold2: k= 9
## - Fold2: k= 9
## + Fold2: k=11
## - Fold2: k=11
## + Fold2: k=13
## - Fold2: k=13
## + Fold2: k=15
## - Fold2: k=15
## + Fold2: k=17
## - Fold2: k=17
## + Fold2: k=19
## - Fold2: k=19
## + Fold2: k=21
## - Fold2: k=21
## + Fold2: k=23
## - Fold2: k=23
## + Fold3: k= 5
## - Fold3: k= 5
## + Fold3: k= 7
## - Fold3: k= 7
## + Fold3: k= 9
## - Fold3: k= 9
## + Fold3: k=11
## - Fold3: k=11
## + Fold3: k=13
## - Fold3: k=13
## + Fold3: k=15
## - Fold3: k=15
## + Fold3: k=17
## - Fold3: k=17
## + Fold3: k=19
## - Fold3: k=19
## + Fold3: k=21
## - Fold3: k=21
## + Fold3: k=23
## - Fold3: k=23
## + Fold4: k= 5
## - Fold4: k= 5
## + Fold4: k= 7
## - Fold4: k= 7
## + Fold4: k= 9
## - Fold4: k= 9
## + Fold4: k=11
## - Fold4: k=11
## + Fold4: k=13
## - Fold4: k=13
## + Fold4: k=15
## - Fold4: k=15
## + Fold4: k=17
## - Fold4: k=17
## + Fold4: k=19
## - Fold4: k=19
## + Fold4: k=21
## - Fold4: k=21
## + Fold4: k=23
## - Fold4: k=23
## + Fold5: k= 5
## - Fold5: k= 5
## + Fold5: k= 7
## - Fold5: k= 7
## + Fold5: k= 9
## - Fold5: k= 9
## + Fold5: k=11
## - Fold5: k=11
## + Fold5: k=13
## - Fold5: k=13
## + Fold5: k=15
## - Fold5: k=15
## + Fold5: k=17
## - Fold5: k=17
## + Fold5: k=19
## - Fold5: k=19
## + Fold5: k=21
## - Fold5: k=21
## + Fold5: k=23
## - Fold5: k=23
## Aggregating results
## Selecting tuning parameters
## Fitting k = 7 on full training set
# Print the five KNN tuning rows with the largest cross-validated RMSE.
# NOTE(review): slice_max() on RMSE lists the weakest candidates (k = 15..23
# below); the selected k = 7 is not among them. Use slice_min() if the intent
# is to inspect the best-performing values of k.
slice_max(model1_knn$results, order_by = RMSE, n = 5)
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 23 291012.6 0.7514432 133091.3 36782.46 0.010182578 4573.151
## 2 21 289179.4 0.7509963 132457.4 37013.93 0.011819152 4557.786
## 3 19 287148.5 0.7510276 131473.3 35478.65 0.009918782 4134.033
## 4 17 285354.2 0.7511004 130807.4 34775.65 0.009782122 4194.026
## 5 15 282134.8 0.7529384 130090.3 32936.11 0.009625775 3761.126
# Plot the resampling profile of the tuned KNN model.
plot(model1_knn)

# Register the KNN model under the label "model1_knn" and print the running
# model comparison table (helper defined outside this chunk).
m_comp_table(model1_knn, "model1_knn")
## # A tibble: 5 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
## 4 model1_gbm 184484. 0.859
## 5 model1_knn 224306. 0.792
# Variable-importance display for the KNN model (helper defined outside this
# chunk).
var_importance(model1_knn, "KNN")

# Seed set ahead of the model1_lasso fit further below.
set.seed(123)

# Look up caret's model definitions whose name matches "lasso". The dump below
# contains four entries: blasso, blassoAveraged, lasso and rqlasso.
getModelInfo("lasso")
## $blasso
## $blasso$label
## [1] "The Bayesian lasso"
##
## $blasso$library
## [1] "monomvn"
##
## $blasso$type
## [1] "Regression"
##
## $blasso$parameters
## parameter class label
## 1 sparsity numeric Sparsity Threshold
##
## $blasso$grid
## function (x, y, len = NULL, search = "grid")
## {
## if (len == 1)
## return(data.frame(sparsity = 0.5))
## if (search == "grid") {
## out <- expand.grid(sparsity = seq(0.3, 0.7, length = len))
## }
## else {
## out <- data.frame(sparsity = runif(len, min = 0, max = 1))
## }
## out
## }
##
## $blasso$loop
## function (grid)
## {
## grid <- grid[order(grid$sparsity, decreasing = TRUE), , drop = FALSE]
## loop <- grid[1, , drop = FALSE]
## submodels <- list(grid[-1, , drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $blasso$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## mod <- monomvn::blasso(as.matrix(x), y, ...)
## mod$.percent <- apply(mod$beta, 2, function(x) mean(x !=
## 0))
## mod$.sparsity <- param$sparsity
## mod$.betas <- colMeans(mod$beta)
## mod
## }
##
## $blasso$predict
## function (modelFit, newdata, submodels = NULL)
## {
## betas <- modelFit$.betas
## betas[modelFit$.percent <= modelFit$.sparsity] <- 0
## if (!is.matrix(newdata))
## newdata <- as.matrix(newdata)
## out <- (newdata %*% betas)[, 1]
## if (modelFit$icept)
## out <- out + mean(modelFit$mu)
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels))
## for (i in 1:nrow(submodels)) {
## betas <- modelFit$.betas
## betas[modelFit$.percent <= submodels$sparsity[i]] <- 0
## tmp[[i]] <- (newdata %*% betas)[, 1]
## if (modelFit$icept)
## tmp[[i]] <- tmp[[i]] + mean(modelFit$mu)
## }
## out <- c(list(out), tmp)
## }
## out
## }
##
## $blasso$predictors
## function (x, s = NULL, ...)
## {
## x$xNames[x$.percent <= x$.sparsity]
## }
##
## $blasso$notes
## [1] "This model creates predictions using the mean of the posterior distributions but sets some parameters specifically to zero based on the tuning parameter `sparsity`. For example, when `sparsity = .5`, only coefficients where at least half the posterior estimates are nonzero are used."
##
## $blasso$tags
## [1] "Linear Regression" "Bayesian Model"
## [3] "Implicit Feature Selection" "L1 Regularization"
##
## $blasso$prob
## NULL
##
## $blasso$sort
## function (x)
## x[order(-x$sparsity), ]
##
##
## $blassoAveraged
## $blassoAveraged$label
## [1] "Bayesian Ridge Regression (Model Averaged)"
##
## $blassoAveraged$library
## [1] "monomvn"
##
## $blassoAveraged$type
## [1] "Regression"
##
## $blassoAveraged$parameters
## parameter class label
## 1 parameter character parameter
##
## $blassoAveraged$grid
## function (x, y, len = NULL, search = "grid")
## data.frame(parameter = "none")
##
## $blassoAveraged$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## out <- monomvn::blasso(as.matrix(x), y, ...)
## out
## }
##
## $blassoAveraged$predict
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.matrix(newdata))
## newdata <- as.matrix(newdata)
## out <- modelFit$beta %*% t(newdata)
## if (modelFit$icept)
## out <- out + (matrix(1, ncol = ncol(out), nrow = nrow(out)) *
## modelFit$mu)
## apply(out, 2, mean)
## }
##
## $blassoAveraged$predictors
## function (x, s = NULL, ...)
## {
## x$xNames[apply(x$beta, 2, function(x) any(x != 0))]
## }
##
## $blassoAveraged$notes
## [1] "This model makes predictions by averaging the predictions based on the posterior estimates of the regression coefficients. While it is possible that some of these posterior estimates are zero for non-informative predictors, the final predicted value may be a function of many (or even all) predictors. "
##
## $blassoAveraged$tags
## [1] "Linear Regression" "Bayesian Model" "L1 Regularization"
##
## $blassoAveraged$prob
## NULL
##
## $blassoAveraged$sort
## function (x)
## x
##
##
## $lasso
## $lasso$label
## [1] "The lasso"
##
## $lasso$library
## [1] "elasticnet"
##
## $lasso$type
## [1] "Regression"
##
## $lasso$parameters
## parameter class label
## 1 fraction numeric Fraction of Full Solution
##
## $lasso$grid
## function (x, y, len = NULL, search = "grid")
## {
## if (search == "grid") {
## out <- expand.grid(fraction = seq(0.1, 0.9, length = len))
## }
## else {
## out <- data.frame(fraction = runif(len, min = 0, max = 1))
## }
## out
## }
##
## $lasso$loop
## function (grid)
## {
## grid <- grid[order(grid$fraction, decreasing = TRUE), , drop = FALSE]
## loop <- grid[1, , drop = FALSE]
## submodels <- list(grid[-1, , drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $lasso$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## elasticnet::enet(as.matrix(x), y, lambda = 0, ...)
## }
##
## $lasso$predict
## function (modelFit, newdata, submodels = NULL)
## {
## out <- elasticnet::predict.enet(modelFit, newdata, s = modelFit$tuneValue$fraction,
## mode = "fraction")$fit
## if (!is.null(submodels)) {
## if (nrow(submodels) > 1) {
## out <- c(list(if (is.matrix(out)) out[, 1] else out),
## as.list(as.data.frame(elasticnet::predict.enet(modelFit,
## newx = newdata, s = submodels$fraction, mode = "fraction")$fit)))
## }
## else {
## tmp <- elasticnet::predict.enet(modelFit, newx = newdata,
## s = submodels$fraction, mode = "fraction")$fit
## out <- c(list(if (is.matrix(out)) out[, 1] else out),
## list(tmp))
## }
## }
## out
## }
##
## $lasso$predictors
## function (x, s = NULL, ...)
## {
## if (is.null(s)) {
## if (!is.null(x$tuneValue)) {
## s <- x$tuneValue$fraction
## }
## else stop("must supply a vaue of s")
## out <- elasticnet::predict.enet(x, s = s, type = "coefficients",
## mode = "fraction")$coefficients
## }
## else {
## out <- elasticnet::predict.enet(x, s = s)$coefficients
## }
## names(out)[out != 0]
## }
##
## $lasso$tags
## [1] "Linear Regression" "Implicit Feature Selection"
## [3] "L1 Regularization"
##
## $lasso$prob
## NULL
##
## $lasso$sort
## function (x)
## x[order(x$fraction), ]
##
##
## $rqlasso
## $rqlasso$label
## [1] "Quantile Regression with LASSO penalty"
##
## $rqlasso$library
## [1] "rqPen"
##
## $rqlasso$type
## [1] "Regression"
##
## $rqlasso$parameters
## parameter class label
## 1 lambda numeric L1 Penalty
##
## $rqlasso$grid
## function (x, y, len = NULL, search = "grid")
## {
## if (search == "grid") {
## out <- expand.grid(lambda = c(10^seq(-1, -4, length = len)))
## }
## else {
## out <- data.frame(lambda = 10^runif(len, min = -5, 1))
## }
## out
## }
##
## $rqlasso$loop
## NULL
##
## $rqlasso$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## rqPen::rq.lasso.fit(as.matrix(x), y, lambda = param$lambda,
## ...)
## }
##
## $rqlasso$predict
## function (modelFit, newdata, submodels = NULL)
## {
## predict(modelFit, newx = as.matrix(newdata))[, 1]
## }
##
## $rqlasso$predictors
## function (x, ...)
## {
## out <- coef(x)
## out <- out[names(out) != "intercept"]
## names(out)[out != 0]
## }
##
## $rqlasso$tags
## [1] "Linear Regression" "Quantile Regression"
## [3] "Implicit Feature Selection" "L1 Regularization"
##
## $rqlasso$prob
## NULL
##
## $rqlasso$sort
## function (x)
## x[order(-x$lambda), ]
# Penalised linear regression via glmnet on the shared feature set, tuned with
# 5-fold cross-validation and selected on RMSE. tuneLength = 20 produces the
# twenty alpha values (0.1 ... 1.0) seen in the log below.
# NOTE(review): despite the "lasso" name, alpha is tuned as well; the log ends
# with "Fitting alpha = 0.1, lambda = 662", i.e. an elastic net far from the
# pure-lasso setting alpha = 1. Pin alpha = 1 via tuneGrid if a true lasso is
# intended.
# The verbose log prints only lambda=476377 for every alpha, yet the selected
# lambda is 662, so smaller penalties are evaluated without their own log lines.
model1_lasso <- train(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    num_tube_lines +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  train_data,
  method = "glmnet",
  metric = "RMSE",
  tuneLength = 20,
  trControl = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE
  )
  # Disabled alternative grid:
  # tuneGrid = expand.grid(lambda = seq(0.00001, 0.0001, 0.00001))
)
## + Fold1: alpha=0.1000, lambda=476377
## - Fold1: alpha=0.1000, lambda=476377
## + Fold1: alpha=0.1474, lambda=476377
## - Fold1: alpha=0.1474, lambda=476377
## + Fold1: alpha=0.1947, lambda=476377
## - Fold1: alpha=0.1947, lambda=476377
## + Fold1: alpha=0.2421, lambda=476377
## - Fold1: alpha=0.2421, lambda=476377
## + Fold1: alpha=0.2895, lambda=476377
## - Fold1: alpha=0.2895, lambda=476377
## + Fold1: alpha=0.3368, lambda=476377
## - Fold1: alpha=0.3368, lambda=476377
## + Fold1: alpha=0.3842, lambda=476377
## - Fold1: alpha=0.3842, lambda=476377
## + Fold1: alpha=0.4316, lambda=476377
## - Fold1: alpha=0.4316, lambda=476377
## + Fold1: alpha=0.4789, lambda=476377
## - Fold1: alpha=0.4789, lambda=476377
## + Fold1: alpha=0.5263, lambda=476377
## - Fold1: alpha=0.5263, lambda=476377
## + Fold1: alpha=0.5737, lambda=476377
## - Fold1: alpha=0.5737, lambda=476377
## + Fold1: alpha=0.6211, lambda=476377
## - Fold1: alpha=0.6211, lambda=476377
## + Fold1: alpha=0.6684, lambda=476377
## - Fold1: alpha=0.6684, lambda=476377
## + Fold1: alpha=0.7158, lambda=476377
## - Fold1: alpha=0.7158, lambda=476377
## + Fold1: alpha=0.7632, lambda=476377
## - Fold1: alpha=0.7632, lambda=476377
## + Fold1: alpha=0.8105, lambda=476377
## - Fold1: alpha=0.8105, lambda=476377
## + Fold1: alpha=0.8579, lambda=476377
## - Fold1: alpha=0.8579, lambda=476377
## + Fold1: alpha=0.9053, lambda=476377
## - Fold1: alpha=0.9053, lambda=476377
## + Fold1: alpha=0.9526, lambda=476377
## - Fold1: alpha=0.9526, lambda=476377
## + Fold1: alpha=1.0000, lambda=476377
## - Fold1: alpha=1.0000, lambda=476377
## + Fold2: alpha=0.1000, lambda=476377
## - Fold2: alpha=0.1000, lambda=476377
## + Fold2: alpha=0.1474, lambda=476377
## - Fold2: alpha=0.1474, lambda=476377
## + Fold2: alpha=0.1947, lambda=476377
## - Fold2: alpha=0.1947, lambda=476377
## + Fold2: alpha=0.2421, lambda=476377
## - Fold2: alpha=0.2421, lambda=476377
## + Fold2: alpha=0.2895, lambda=476377
## - Fold2: alpha=0.2895, lambda=476377
## + Fold2: alpha=0.3368, lambda=476377
## - Fold2: alpha=0.3368, lambda=476377
## + Fold2: alpha=0.3842, lambda=476377
## - Fold2: alpha=0.3842, lambda=476377
## + Fold2: alpha=0.4316, lambda=476377
## - Fold2: alpha=0.4316, lambda=476377
## + Fold2: alpha=0.4789, lambda=476377
## - Fold2: alpha=0.4789, lambda=476377
## + Fold2: alpha=0.5263, lambda=476377
## - Fold2: alpha=0.5263, lambda=476377
## + Fold2: alpha=0.5737, lambda=476377
## - Fold2: alpha=0.5737, lambda=476377
## + Fold2: alpha=0.6211, lambda=476377
## - Fold2: alpha=0.6211, lambda=476377
## + Fold2: alpha=0.6684, lambda=476377
## - Fold2: alpha=0.6684, lambda=476377
## + Fold2: alpha=0.7158, lambda=476377
## - Fold2: alpha=0.7158, lambda=476377
## + Fold2: alpha=0.7632, lambda=476377
## - Fold2: alpha=0.7632, lambda=476377
## + Fold2: alpha=0.8105, lambda=476377
## - Fold2: alpha=0.8105, lambda=476377
## + Fold2: alpha=0.8579, lambda=476377
## - Fold2: alpha=0.8579, lambda=476377
## + Fold2: alpha=0.9053, lambda=476377
## - Fold2: alpha=0.9053, lambda=476377
## + Fold2: alpha=0.9526, lambda=476377
## - Fold2: alpha=0.9526, lambda=476377
## + Fold2: alpha=1.0000, lambda=476377
## - Fold2: alpha=1.0000, lambda=476377
## + Fold3: alpha=0.1000, lambda=476377
## - Fold3: alpha=0.1000, lambda=476377
## + Fold3: alpha=0.1474, lambda=476377
## - Fold3: alpha=0.1474, lambda=476377
## + Fold3: alpha=0.1947, lambda=476377
## - Fold3: alpha=0.1947, lambda=476377
## + Fold3: alpha=0.2421, lambda=476377
## - Fold3: alpha=0.2421, lambda=476377
## + Fold3: alpha=0.2895, lambda=476377
## - Fold3: alpha=0.2895, lambda=476377
## + Fold3: alpha=0.3368, lambda=476377
## - Fold3: alpha=0.3368, lambda=476377
## + Fold3: alpha=0.3842, lambda=476377
## - Fold3: alpha=0.3842, lambda=476377
## + Fold3: alpha=0.4316, lambda=476377
## - Fold3: alpha=0.4316, lambda=476377
## + Fold3: alpha=0.4789, lambda=476377
## - Fold3: alpha=0.4789, lambda=476377
## + Fold3: alpha=0.5263, lambda=476377
## - Fold3: alpha=0.5263, lambda=476377
## + Fold3: alpha=0.5737, lambda=476377
## - Fold3: alpha=0.5737, lambda=476377
## + Fold3: alpha=0.6211, lambda=476377
## - Fold3: alpha=0.6211, lambda=476377
## + Fold3: alpha=0.6684, lambda=476377
## - Fold3: alpha=0.6684, lambda=476377
## + Fold3: alpha=0.7158, lambda=476377
## - Fold3: alpha=0.7158, lambda=476377
## + Fold3: alpha=0.7632, lambda=476377
## - Fold3: alpha=0.7632, lambda=476377
## + Fold3: alpha=0.8105, lambda=476377
## - Fold3: alpha=0.8105, lambda=476377
## + Fold3: alpha=0.8579, lambda=476377
## - Fold3: alpha=0.8579, lambda=476377
## + Fold3: alpha=0.9053, lambda=476377
## - Fold3: alpha=0.9053, lambda=476377
## + Fold3: alpha=0.9526, lambda=476377
## - Fold3: alpha=0.9526, lambda=476377
## + Fold3: alpha=1.0000, lambda=476377
## - Fold3: alpha=1.0000, lambda=476377
## + Fold4: alpha=0.1000, lambda=476377
## - Fold4: alpha=0.1000, lambda=476377
## + Fold4: alpha=0.1474, lambda=476377
## - Fold4: alpha=0.1474, lambda=476377
## + Fold4: alpha=0.1947, lambda=476377
## - Fold4: alpha=0.1947, lambda=476377
## + Fold4: alpha=0.2421, lambda=476377
## - Fold4: alpha=0.2421, lambda=476377
## + Fold4: alpha=0.2895, lambda=476377
## - Fold4: alpha=0.2895, lambda=476377
## + Fold4: alpha=0.3368, lambda=476377
## - Fold4: alpha=0.3368, lambda=476377
## + Fold4: alpha=0.3842, lambda=476377
## - Fold4: alpha=0.3842, lambda=476377
## + Fold4: alpha=0.4316, lambda=476377
## - Fold4: alpha=0.4316, lambda=476377
## + Fold4: alpha=0.4789, lambda=476377
## - Fold4: alpha=0.4789, lambda=476377
## + Fold4: alpha=0.5263, lambda=476377
## - Fold4: alpha=0.5263, lambda=476377
## + Fold4: alpha=0.5737, lambda=476377
## - Fold4: alpha=0.5737, lambda=476377
## + Fold4: alpha=0.6211, lambda=476377
## - Fold4: alpha=0.6211, lambda=476377
## + Fold4: alpha=0.6684, lambda=476377
## - Fold4: alpha=0.6684, lambda=476377
## + Fold4: alpha=0.7158, lambda=476377
## - Fold4: alpha=0.7158, lambda=476377
## + Fold4: alpha=0.7632, lambda=476377
## - Fold4: alpha=0.7632, lambda=476377
## + Fold4: alpha=0.8105, lambda=476377
## - Fold4: alpha=0.8105, lambda=476377
## + Fold4: alpha=0.8579, lambda=476377
## - Fold4: alpha=0.8579, lambda=476377
## + Fold4: alpha=0.9053, lambda=476377
## - Fold4: alpha=0.9053, lambda=476377
## + Fold4: alpha=0.9526, lambda=476377
## - Fold4: alpha=0.9526, lambda=476377
## + Fold4: alpha=1.0000, lambda=476377
## - Fold4: alpha=1.0000, lambda=476377
## + Fold5: alpha=0.1000, lambda=476377
## - Fold5: alpha=0.1000, lambda=476377
## + Fold5: alpha=0.1474, lambda=476377
## - Fold5: alpha=0.1474, lambda=476377
## + Fold5: alpha=0.1947, lambda=476377
## - Fold5: alpha=0.1947, lambda=476377
## + Fold5: alpha=0.2421, lambda=476377
## - Fold5: alpha=0.2421, lambda=476377
## + Fold5: alpha=0.2895, lambda=476377
## - Fold5: alpha=0.2895, lambda=476377
## + Fold5: alpha=0.3368, lambda=476377
## - Fold5: alpha=0.3368, lambda=476377
## + Fold5: alpha=0.3842, lambda=476377
## - Fold5: alpha=0.3842, lambda=476377
## + Fold5: alpha=0.4316, lambda=476377
## - Fold5: alpha=0.4316, lambda=476377
## + Fold5: alpha=0.4789, lambda=476377
## - Fold5: alpha=0.4789, lambda=476377
## + Fold5: alpha=0.5263, lambda=476377
## - Fold5: alpha=0.5263, lambda=476377
## + Fold5: alpha=0.5737, lambda=476377
## - Fold5: alpha=0.5737, lambda=476377
## + Fold5: alpha=0.6211, lambda=476377
## - Fold5: alpha=0.6211, lambda=476377
## + Fold5: alpha=0.6684, lambda=476377
## - Fold5: alpha=0.6684, lambda=476377
## + Fold5: alpha=0.7158, lambda=476377
## - Fold5: alpha=0.7158, lambda=476377
## + Fold5: alpha=0.7632, lambda=476377
## - Fold5: alpha=0.7632, lambda=476377
## + Fold5: alpha=0.8105, lambda=476377
## - Fold5: alpha=0.8105, lambda=476377
## + Fold5: alpha=0.8579, lambda=476377
## - Fold5: alpha=0.8579, lambda=476377
## + Fold5: alpha=0.9053, lambda=476377
## - Fold5: alpha=0.9053, lambda=476377
## + Fold5: alpha=0.9526, lambda=476377
## - Fold5: alpha=0.9526, lambda=476377
## + Fold5: alpha=1.0000, lambda=476377
## - Fold5: alpha=1.0000, lambda=476377
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 662 on full training set
# Coefficients of the final glmnet fit, evaluated at the penalty stored in
# finalModel$lambdaOpt (the best tune). Entries printed as "." were shrunk to
# exactly zero.
coef(model1_lasso$finalModel, s = model1_lasso$finalModel$lambdaOpt)
## 77 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 4.850698e+06
## districtBarnet 3.319663e+04
## districtBexley -7.616525e+04
## districtBrent 6.531871e+04
## districtBromley -7.520483e+04
## districtCamden 2.442847e+05
## districtCity of London 2.377738e+05
## districtCroydon -1.225369e+05
## districtEaling 1.385957e+04
## districtEnfield -5.959763e+03
## districtGreenwich -7.266465e+04
## districtHackney 8.505916e+04
## districtHammersmith and Fulham 2.371377e+05
## districtHaringey 4.858597e+04
## districtHarrow 3.900503e+04
## districtHavering 5.150975e+04
## districtHillingdon 7.964092e+04
## districtHounslow 6.887761e+04
## districtIslington 1.609322e+05
## districtKensington and Chelsea 9.367605e+05
## districtKingston upon Thames 2.236781e+04
## districtLambeth .
## districtLewisham -9.744222e+04
## districtMerton -3.826094e+04
## districtNewham -8.936304e+04
## districtRedbridge -6.866925e+04
## districtRichmond upon Thames 1.119354e+05
## districtSouthwark 3.395084e+04
## districtSutton -7.208374e+04
## districtTower Hamlets -2.893960e+04
## districtWaltham Forest 1.303843e+04
## districtWandsworth 1.109862e+04
## districtWestminster 5.619040e+05
## water_companyEssex & Suffolk Water -4.464684e+04
## water_companyLeep Utilities 3.322630e+04
## water_companySES Water 2.794350e+04
## water_companyThames Water 3.130278e+04
## property_typeF -1.519584e+04
## property_typeS -1.335340e+04
## property_typeT 1.558466e+04
## freehold_or_leaseholdL -7.169640e+03
## latitude -9.448527e+04
## longitude 6.008547e+05
## total_floor_area 2.854125e+03
## altitude 1.544557e+03
## I(average_income^3) 8.992057e-10
## energy_consumption_current 2.251584e+02
## energy_consumption_potential -2.168404e+02
## windows_energy_effGood 2.043349e+04
## windows_energy_effPoor 4.777248e+04
## windows_energy_effVery Good 4.361440e+04
## windows_energy_effVery Poor 2.699699e+04
## current_energy_ratingC -1.940559e+04
## current_energy_ratingD -2.766628e+04
## current_energy_ratingE -6.571431e+04
## current_energy_ratingF -1.483409e+05
## current_energy_ratingG -2.124953e+05
## distance_to_station -8.038599e+03
## num_tube_lines 8.118964e+03
## type_of_closest_stationrail -5.802396e+04
## type_of_closest_stationtube -3.492956e+04
## num_rail_lines 9.262781e+03
## num_light_rail_lines -4.693128e+04
## average_income:london_zone -9.384378e-01
## latitude:longitude 7.739864e+03
## latitude:total_floor_area 5.854560e+01
## longitude:total_floor_area -5.840198e+03
## latitude:altitude 3.239231e+01
## longitude:altitude -4.883245e+03
## total_floor_area:altitude -1.574600e+01
## co2_emissions_current:co2_emissions_potential 3.838565e+03
## latitude:longitude:total_floor_area -9.856796e+01
## latitude:longitude:altitude -5.102453e+01
## latitude:total_floor_area:altitude -2.592835e-01
## longitude:total_floor_area:altitude 5.919180e+01
## average_income:total_floor_area:number_habitable_rooms 1.370616e-03
## latitude:longitude:total_floor_area:altitude 6.846235e-01
# Print the five glmnet tuning rows with the largest cross-validated RMSE.
# NOTE(review): these are the weakest candidates (the largest penalty, where
# Rsquared is NaN in the output below), not the selected tune. Use slice_min()
# to inspect the best rows instead.
# Disabled alternative: arrange(RMSE) would sort rows by ascending RMSE.
slice_max(model1_lasso$results, order_by = RMSE, n = 5)
## alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.8105263 476377.1 528614 NaN 286402.4 41756.82 NA 5938.273
## 2 0.8578947 476377.1 528614 NaN 286402.4 41756.82 NA 5938.273
## 3 0.9052632 476377.1 528614 NaN 286402.4 41756.82 NA 5938.273
## 4 0.9526316 476377.1 528614 NaN 286402.4 41756.82 NA 5938.273
## 5 1.0000000 476377.1 528614 NaN 286402.4 41756.82 NA 5938.273
# Add model1_lasso to the running model-comparison table under the label "model1_lasso".
# NOTE(review): m_comp_table() is defined outside this section; presumably it
# computes RMSE and R-squared for the model and appends a row -- confirm.
m_comp_table(model1_lasso, "model1_lasso")## # A tibble: 6 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
## 4 model1_gbm 184484. 0.859
## 5 model1_knn 224306. 0.792
## 6 model1_lasso 259062. 0.716
var_importance(model1_lasso, "Lasso")
Use stacking to ensemble your algorithms.
# Train the base learners that feed the stacked ensemble.
# All six models share one formula, the same training data and the same
# 5-fold cross-validation setup; each is fitted with a single, fixed
# hyperparameter combination (one-row tuning grid).
model_list <- caretList(
  price ~ average_income:london_zone +
    district +
    water_company +
    property_type +
    freehold_or_leasehold +
    latitude * longitude * total_floor_area * altitude +
    average_income:number_habitable_rooms:total_floor_area +
    I(average_income^3) +
    energy_consumption_current +
    energy_consumption_potential +
    windows_energy_eff +
    co2_emissions_current:co2_emissions_potential +
    current_energy_rating +
    distance_to_station +
    type_of_closest_station +
    num_rail_lines +
    num_light_rail_lines,
  data = train_data,
  metric = "RMSE",
  tuneList = list(
    # Random forest (ranger).
    rft = caretModelSpec(
      method = "ranger",
      tuneGrid = expand.grid(
        mtry = 19,
        splitrule = "variance",
        min.node.size = 3
      )
    ),
    # Gradient boosting. The grid is built with one expand.grid() call;
    # wrapping it in a second expand.grid(tuneGrid = ...) is redundant and
    # only works because expand.grid() unwraps a single list argument.
    gbm = caretModelSpec(
      method = "gbm",
      tuneGrid = expand.grid(
        n.trees = 1500,
        interaction.depth = 20,
        shrinkage = 0.1,
        n.minobsinnode = 6
      )
    ),
    # k-nearest neighbours.
    knn = caretModelSpec(method = "knn", tuneGrid = expand.grid(k = 7)),
    # Ordinary linear regression (no tuning parameters supplied).
    lm = caretModelSpec(method = "lm"),
    # Single regression tree with a very small complexity parameter.
    rpart = caretModelSpec(method = "rpart", tuneGrid = data.frame(cp = 0.00004)),
    # NOTE(review): 0.8578947 coincides with one of the glmnet `alpha` values
    # in the model1_lasso tuning grid, but method = "lasso" is tuned through
    # `fraction` -- confirm this value is intended here.
    lasso = caretModelSpec(method = "lasso", tuneGrid = data.frame(fraction = 0.8578947))
  ),
  trControl = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE
  )
)## + Fold1: mtry=19, splitrule=variance, min.node.size=3
## - Fold1: mtry=19, splitrule=variance, min.node.size=3
## + Fold2: mtry=19, splitrule=variance, min.node.size=3
## - Fold2: mtry=19, splitrule=variance, min.node.size=3
## + Fold3: mtry=19, splitrule=variance, min.node.size=3
## - Fold3: mtry=19, splitrule=variance, min.node.size=3
## + Fold4: mtry=19, splitrule=variance, min.node.size=3
## - Fold4: mtry=19, splitrule=variance, min.node.size=3
## + Fold5: mtry=19, splitrule=variance, min.node.size=3
## - Fold5: mtry=19, splitrule=variance, min.node.size=3
## Aggregating results
## Fitting final model on full training set
## + Fold1: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 239506057109.9528 nan 0.1000 33641681571.5707
## 2 203938156292.7404 nan 0.1000 30809455172.0012
## 3 176968220074.5434 nan 0.1000 25016640150.6639
## 4 154357509815.9872 nan 0.1000 17753396915.3231
## 5 135684277162.6166 nan 0.1000 17209702592.0303
## 6 119420860268.8380 nan 0.1000 13454618413.3293
## 7 105907205839.3454 nan 0.1000 13999700846.0757
## 8 95340405847.5161 nan 0.1000 10171978455.4489
## 9 85275278644.2772 nan 0.1000 8479097454.9256
## 10 77823046524.1036 nan 0.1000 6091714224.4009
## 20 39611800684.6123 nan 0.1000 1335613554.2017
## 40 23837922679.2426 nan 0.1000 -37089121.8222
## 60 19035214405.1438 nan 0.1000 -167090320.6492
## 80 15609670376.7953 nan 0.1000 -64533928.0644
## 100 13545480668.9110 nan 0.1000 -35460550.9330
## 120 11816253544.6905 nan 0.1000 -39982249.0969
## 140 10439170338.1130 nan 0.1000 -16229877.5721
## 160 9442835704.3915 nan 0.1000 -14805265.7296
## 180 8540966506.6616 nan 0.1000 -8935999.7610
## 200 7831816435.1366 nan 0.1000 -22895261.7377
## 220 7189108183.3234 nan 0.1000 -20730061.3941
## 240 6670525023.4344 nan 0.1000 -8246002.6203
## 260 6153565274.2280 nan 0.1000 -8777253.0138
## 280 5724237991.9574 nan 0.1000 -9285197.7174
## 300 5355619495.5944 nan 0.1000 -8739785.1877
## 320 4992816695.6909 nan 0.1000 -1653842.5666
## 340 4687320021.2810 nan 0.1000 -4056357.8285
## 360 4431322635.8349 nan 0.1000 -10036321.0175
## 380 4160926461.1435 nan 0.1000 -1179621.4333
## 400 3936269353.4263 nan 0.1000 -4249402.9952
## 420 3730515753.7530 nan 0.1000 -8720458.2605
## 440 3521065019.8245 nan 0.1000 -2302250.8323
## 460 3316131409.8624 nan 0.1000 -4906287.5283
## 480 3138882858.8764 nan 0.1000 -7130450.6686
## 500 2988933204.3360 nan 0.1000 -9442597.5890
## 520 2836995409.2115 nan 0.1000 -3769967.9275
## 540 2702093823.6394 nan 0.1000 -2926251.8323
## 560 2572073307.0224 nan 0.1000 -5480977.0753
## 580 2445481605.7467 nan 0.1000 -5127214.9560
## 600 2326393971.9882 nan 0.1000 -3509966.3226
## 620 2212448973.3187 nan 0.1000 -590012.5638
## 640 2107176188.8896 nan 0.1000 -4439270.8775
## 660 2004469810.8299 nan 0.1000 -4557418.0483
## 680 1909949134.6034 nan 0.1000 -651698.3497
## 700 1819601340.0795 nan 0.1000 -2494391.9804
## 720 1735172264.7520 nan 0.1000 -1317932.1616
## 740 1658632341.5667 nan 0.1000 -2773629.1257
## 760 1588984143.3212 nan 0.1000 -3305843.3872
## 780 1520104337.2872 nan 0.1000 -2821985.1373
## 800 1454884118.4158 nan 0.1000 -2404901.2273
## 820 1391929545.7740 nan 0.1000 -3455363.1293
## 840 1329476148.9527 nan 0.1000 -1307626.0480
## 860 1273954085.1474 nan 0.1000 -2498223.2564
## 880 1222537658.6550 nan 0.1000 -2270237.4404
## 900 1168680676.4550 nan 0.1000 -2968367.2503
## 920 1118566840.4679 nan 0.1000 -2343844.8649
## 940 1072901776.0363 nan 0.1000 -2241673.6238
## 960 1030497379.5183 nan 0.1000 -2852382.3851
## 980 988124437.9881 nan 0.1000 -1496898.4600
## 1000 945323033.0748 nan 0.1000 -2218057.2991
## 1020 906785883.4258 nan 0.1000 -950593.8871
## 1040 870082308.1271 nan 0.1000 -1830646.4425
## 1060 835163984.2608 nan 0.1000 -906208.6711
## 1080 799104454.2218 nan 0.1000 -1775943.2192
## 1100 766963919.7438 nan 0.1000 -1301761.6916
## 1120 738371396.8379 nan 0.1000 -815800.6311
## 1140 707745529.9964 nan 0.1000 -1516015.1363
## 1160 682048953.4880 nan 0.1000 -1043737.4255
## 1180 653631172.4939 nan 0.1000 -1212068.5474
## 1200 626374505.8359 nan 0.1000 -700754.4332
## 1220 602385565.8165 nan 0.1000 -1159554.6426
## 1240 577889880.4456 nan 0.1000 -680060.0918
## 1260 554691644.3614 nan 0.1000 -1023155.7652
## 1280 533059723.8604 nan 0.1000 -985237.7515
## 1300 511560109.7085 nan 0.1000 -913449.1576
## 1320 493147726.0173 nan 0.1000 -1050621.9464
## 1340 473449872.7639 nan 0.1000 -1164608.2129
## 1360 453964822.4383 nan 0.1000 -1066252.9511
## 1380 436088294.3588 nan 0.1000 -392969.0464
## 1400 419394286.0137 nan 0.1000 -1209696.6233
## 1420 403589582.7043 nan 0.1000 -408422.1139
## 1440 389293386.1677 nan 0.1000 -771832.1164
## 1460 374600527.9688 nan 0.1000 -1126846.8206
## 1480 359704195.1277 nan 0.1000 -781322.4504
## 1500 345080518.9524 nan 0.1000 -333010.7382
##
## - Fold1: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## + Fold2: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 248911482626.6854 nan 0.1000 41760608961.1864
## 2 214401703788.8302 nan 0.1000 32515170656.9051
## 3 187993615536.8674 nan 0.1000 26708236233.4888
## 4 165191453859.2851 nan 0.1000 21143548074.1441
## 5 144080235147.4048 nan 0.1000 19304947654.2402
## 6 126315491991.3147 nan 0.1000 14066858401.8768
## 7 111946687562.3668 nan 0.1000 10949294667.9282
## 8 100450146434.5654 nan 0.1000 11045632511.5978
## 9 91134618536.9370 nan 0.1000 9107735597.1646
## 10 82233330465.1759 nan 0.1000 7644107146.7795
## 20 42126624309.6338 nan 0.1000 1222062276.9261
## 40 25528632275.3350 nan 0.1000 146238261.9573
## 60 19557045474.9586 nan 0.1000 -88753804.6421
## 80 16285293599.4256 nan 0.1000 -206829030.3678
## 100 13951091460.4127 nan 0.1000 -38330162.9616
## 120 12104087690.7370 nan 0.1000 -14422034.3357
## 140 10778569532.5212 nan 0.1000 -62984637.1531
## 160 9705492330.1150 nan 0.1000 -34136848.3513
## 180 8797548948.1374 nan 0.1000 -27022229.5472
## 200 8038912045.2316 nan 0.1000 -33130381.4481
## 220 7377564451.4715 nan 0.1000 -16478124.4313
## 240 6779138559.5783 nan 0.1000 -16819868.5110
## 260 6307533077.0651 nan 0.1000 -9786375.1340
## 280 5898927833.6322 nan 0.1000 -12153996.2484
## 300 5480835367.4441 nan 0.1000 -24860684.8469
## 320 5157659525.3181 nan 0.1000 -12593200.7415
## 340 4842541846.4761 nan 0.1000 -13971710.5707
## 360 4579983544.6568 nan 0.1000 -4831571.0245
## 380 4300174121.9465 nan 0.1000 -5461477.0496
## 400 4071480139.2568 nan 0.1000 -17338126.7464
## 420 3838512784.5836 nan 0.1000 -14538570.6428
## 440 3619816281.9062 nan 0.1000 -16650975.8739
## 460 3424185023.5848 nan 0.1000 -5166015.0522
## 480 3260049697.0112 nan 0.1000 -6208218.7151
## 500 3091132808.3429 nan 0.1000 -6032302.7666
## 520 2935584504.1296 nan 0.1000 -5605675.9825
## 540 2785518216.4143 nan 0.1000 -4746649.0220
## 560 2646381700.3486 nan 0.1000 214268.4956
## 580 2517337115.6826 nan 0.1000 -6063165.8348
## 600 2401335801.3266 nan 0.1000 -3222238.7083
## 620 2286909967.0537 nan 0.1000 -5646434.8353
## 640 2177311292.1678 nan 0.1000 -4121417.0166
## 660 2071724674.1593 nan 0.1000 -5212728.8987
## 680 1976644467.1025 nan 0.1000 -5048370.7471
## 700 1877919384.2977 nan 0.1000 -5069068.0910
## 720 1798176947.8758 nan 0.1000 -3548688.2284
## 740 1719267823.1489 nan 0.1000 -4380973.1533
## 760 1643729390.2419 nan 0.1000 -2677685.3723
## 780 1571257679.0698 nan 0.1000 -4141631.1134
## 800 1498524061.9143 nan 0.1000 -2634983.3778
## 820 1434139236.9313 nan 0.1000 -2362517.4629
## 840 1374221593.1100 nan 0.1000 -3576271.1981
## 860 1309428272.4008 nan 0.1000 -2561541.2874
## 880 1253437805.8595 nan 0.1000 -1832870.3106
## 900 1197994248.1818 nan 0.1000 -2427198.9205
## 920 1143394100.5802 nan 0.1000 -3215036.4488
## 940 1093881880.3077 nan 0.1000 -1549841.1124
## 960 1048056582.9064 nan 0.1000 -1745413.8091
## 980 1006582141.9311 nan 0.1000 -2101310.5354
## 1000 965733226.9575 nan 0.1000 -1748709.1370
## 1020 926665460.0004 nan 0.1000 -2307929.0741
## 1040 887086100.6386 nan 0.1000 -1808947.7366
## 1060 850444980.9821 nan 0.1000 -621441.5923
## 1080 815204007.6745 nan 0.1000 -1318683.7473
## 1100 779987871.8608 nan 0.1000 -173860.7905
## 1120 750269926.3231 nan 0.1000 -1498058.1030
## 1140 721025304.1845 nan 0.1000 -1305523.7029
## 1160 690533706.6907 nan 0.1000 -1332575.0176
## 1180 663635403.0172 nan 0.1000 -1215911.4327
## 1200 638025128.7963 nan 0.1000 -1696713.2448
## 1220 612498585.2807 nan 0.1000 -1283509.3365
## 1240 586883676.0381 nan 0.1000 -1440209.5298
## 1260 564439554.5194 nan 0.1000 -1010361.0245
## 1280 542598275.1716 nan 0.1000 -693178.1467
## 1300 522362136.2946 nan 0.1000 -1449389.8122
## 1320 502082187.8734 nan 0.1000 -1147872.3789
## 1340 483088044.0629 nan 0.1000 -1345769.9423
## 1360 463934887.5482 nan 0.1000 -576999.2420
## 1380 445253941.9389 nan 0.1000 -858924.1718
## 1400 427862568.7084 nan 0.1000 -605878.8442
## 1420 410628788.8607 nan 0.1000 -909567.8739
## 1440 394086978.8786 nan 0.1000 -557320.0396
## 1460 378734943.3255 nan 0.1000 -984187.7803
## 1480 363383335.3589 nan 0.1000 -606699.2764
## 1500 350005872.1721 nan 0.1000 -1018769.1672
##
## - Fold2: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## + Fold3: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 234482602959.3299 nan 0.1000 34800324363.9858
## 2 200970642392.9247 nan 0.1000 30266582580.5127
## 3 176835642983.4241 nan 0.1000 21534123198.4728
## 4 155154630715.7007 nan 0.1000 22989222855.4670
## 5 136730925182.6691 nan 0.1000 17535758915.8310
## 6 120441692864.5611 nan 0.1000 15737208686.7855
## 7 108249547952.0278 nan 0.1000 11556223520.4266
## 8 96758940349.2286 nan 0.1000 9464138440.4250
## 9 86191841082.9088 nan 0.1000 9563462857.1827
## 10 78042360696.0059 nan 0.1000 6809621065.9037
## 20 41389986963.6489 nan 0.1000 924253962.1411
## 40 25545480313.1542 nan 0.1000 -317711500.9087
## 60 20186084479.7973 nan 0.1000 -104135783.9729
## 80 16797396824.9082 nan 0.1000 -115566431.3553
## 100 14361292781.2784 nan 0.1000 -86049551.4585
## 120 12712617946.7378 nan 0.1000 -168351.8279
## 140 11317270237.3507 nan 0.1000 -29972879.3722
## 160 10173332405.0954 nan 0.1000 -78721455.3298
## 180 9272486368.5912 nan 0.1000 -20729071.4632
## 200 8451328173.2067 nan 0.1000 -20065977.9189
## 220 7776689128.2794 nan 0.1000 -19309953.5676
## 240 7142248146.4005 nan 0.1000 -30890042.6398
## 260 6700827449.3878 nan 0.1000 -13141662.9407
## 280 6238139322.5542 nan 0.1000 -25675467.3105
## 300 5830683939.9599 nan 0.1000 -20569185.3719
## 320 5451063218.6233 nan 0.1000 -13584960.8078
## 340 5125092148.0945 nan 0.1000 -12379204.4993
## 360 4815744871.5956 nan 0.1000 -6589744.3450
## 380 4505068916.5619 nan 0.1000 -7552921.9826
## 400 4257965004.3162 nan 0.1000 -5388996.0767
## 420 4024946757.0374 nan 0.1000 -13019804.0677
## 440 3803073541.4898 nan 0.1000 -14645415.8997
## 460 3605440969.8344 nan 0.1000 -11974917.8454
## 480 3407272048.4621 nan 0.1000 -4858007.6553
## 500 3234252389.8072 nan 0.1000 -8503046.5253
## 520 3058189765.6805 nan 0.1000 -1768000.9679
## 540 2916284797.5293 nan 0.1000 -7272775.5828
## 560 2772930383.2892 nan 0.1000 -3647390.5661
## 580 2638390792.4095 nan 0.1000 -4448147.4459
## 600 2513780275.9926 nan 0.1000 -4271095.8293
## 620 2388442667.5919 nan 0.1000 -7230465.6429
## 640 2273752113.9279 nan 0.1000 -4647657.8776
## 660 2162408049.0329 nan 0.1000 -5909451.4656
## 680 2065840405.8734 nan 0.1000 -5195481.8753
## 700 1976675427.7687 nan 0.1000 -2498039.5687
## 720 1890488114.2441 nan 0.1000 -2082214.6183
## 740 1808117508.2022 nan 0.1000 -2486260.2125
## 760 1726558152.4197 nan 0.1000 -2162922.1328
## 780 1655555351.5074 nan 0.1000 -2831028.9844
## 800 1578775423.6053 nan 0.1000 -2384565.1119
## 820 1508014013.6129 nan 0.1000 -1142148.7788
## 840 1441488652.7447 nan 0.1000 -2099226.6588
## 860 1380157923.6981 nan 0.1000 -3543606.9034
## 880 1325795559.1740 nan 0.1000 -2525020.2621
## 900 1268276585.5020 nan 0.1000 -3158643.7672
## 920 1213147343.8435 nan 0.1000 -991422.1264
## 940 1159177002.4800 nan 0.1000 -2211761.1619
## 960 1110089925.4423 nan 0.1000 -1188750.4814
## 980 1061852593.9688 nan 0.1000 -1146766.9768
## 1000 1016671955.6900 nan 0.1000 -1504620.4035
## 1020 975094124.5166 nan 0.1000 -2071238.8642
## 1040 935069188.1856 nan 0.1000 -1230227.2557
## 1060 897561451.5997 nan 0.1000 -877473.5983
## 1080 857518194.0556 nan 0.1000 -1449736.2425
## 1100 823449705.3083 nan 0.1000 -1544471.6227
## 1120 791300016.0132 nan 0.1000 -1429200.3673
## 1140 760379011.1210 nan 0.1000 -1668953.6785
## 1160 731447288.6416 nan 0.1000 -1472586.0026
## 1180 701900591.9584 nan 0.1000 -2084471.0393
## 1200 674104578.2449 nan 0.1000 -1407461.3328
## 1220 648110180.3533 nan 0.1000 -1076159.4095
## 1240 623430407.9782 nan 0.1000 -496445.8419
## 1260 600049411.3367 nan 0.1000 -1745790.9838
## 1280 575173190.4261 nan 0.1000 -1563549.9829
## 1300 552720579.8777 nan 0.1000 -1005437.9093
## 1320 531834394.1488 nan 0.1000 -653477.9050
## 1340 511971872.1300 nan 0.1000 -1628412.3851
## 1360 492357346.5200 nan 0.1000 -785449.0292
## 1380 473253505.0595 nan 0.1000 -1223377.6836
## 1400 456620884.6813 nan 0.1000 -1081096.8377
## 1420 439531320.1108 nan 0.1000 -453344.9463
## 1440 423636082.6483 nan 0.1000 -150913.6936
## 1460 408586662.0713 nan 0.1000 -793533.3767
## 1480 393191347.0452 nan 0.1000 -736087.3050
## 1500 377888914.2814 nan 0.1000 -667407.1574
##
## - Fold3: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## + Fold4: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 233121884328.4700 nan 0.1000 36387256171.9617
## 2 200961443438.6931 nan 0.1000 33288712685.2779
## 3 173234204675.3751 nan 0.1000 28168167713.3648
## 4 152805550999.6260 nan 0.1000 23104032471.3218
## 5 136393333765.4269 nan 0.1000 13784434620.5362
## 6 121417255118.8586 nan 0.1000 13886045994.2684
## 7 107341468301.8206 nan 0.1000 11316922337.4371
## 8 96756985367.4346 nan 0.1000 10216509434.5796
## 9 87791618873.3507 nan 0.1000 7321960862.7750
## 10 79298992889.6617 nan 0.1000 5442118561.7812
## 20 41239917939.7117 nan 0.1000 688442448.1888
## 40 24966460563.6737 nan 0.1000 -115656325.7545
## 60 19945498651.4031 nan 0.1000 -270221534.9709
## 80 16618267449.2749 nan 0.1000 -48334367.7666
## 100 14150018305.0289 nan 0.1000 -25405237.7374
## 120 12481335254.6618 nan 0.1000 -15046748.7350
## 140 11064213484.3781 nan 0.1000 -19549020.0795
## 160 9834949934.4712 nan 0.1000 -82964980.1510
## 180 8898117909.7894 nan 0.1000 -41493463.9529
## 200 8090427284.9398 nan 0.1000 -21072594.0062
## 220 7448791877.5296 nan 0.1000 -31219654.2083
## 240 6930777599.6863 nan 0.1000 -40654255.8856
## 260 6422789281.3756 nan 0.1000 -19304450.7107
## 280 6001436842.2327 nan 0.1000 -18019512.6524
## 300 5574608385.5694 nan 0.1000 -11307360.0737
## 320 5217885578.3485 nan 0.1000 -12651831.5615
## 340 4905846618.1692 nan 0.1000 -4346015.7499
## 360 4591179903.3186 nan 0.1000 -7283491.4970
## 380 4324474439.3129 nan 0.1000 -10470062.3368
## 400 4073773104.0255 nan 0.1000 -5756563.4233
## 420 3866225606.2646 nan 0.1000 -1942074.4757
## 440 3652924981.6435 nan 0.1000 -8725471.5262
## 460 3451770064.3737 nan 0.1000 -6082113.0602
## 480 3258476175.2390 nan 0.1000 -5877747.0702
## 500 3082403439.1213 nan 0.1000 -1934678.6780
## 520 2928520974.0669 nan 0.1000 -3002974.0383
## 540 2769206413.0165 nan 0.1000 -1315050.6522
## 560 2635043859.9642 nan 0.1000 -2845036.4448
## 580 2501358272.2820 nan 0.1000 -4703696.1636
## 600 2379842583.8806 nan 0.1000 -5292461.2991
## 620 2259792211.4067 nan 0.1000 -3246484.8101
## 640 2152323644.7347 nan 0.1000 -5934079.0322
## 660 2042159020.5133 nan 0.1000 -3208875.9858
## 680 1946220253.7778 nan 0.1000 -3449958.2175
## 700 1850119823.1552 nan 0.1000 -2426724.2584
## 720 1764957835.9021 nan 0.1000 -1656154.1876
## 740 1683440936.9969 nan 0.1000 -1881895.0977
## 760 1610465906.1718 nan 0.1000 -1974260.6848
## 780 1537835437.9410 nan 0.1000 -1915408.8784
## 800 1471263070.8499 nan 0.1000 -2187099.6524
## 820 1405412099.4750 nan 0.1000 -2864975.1634
## 840 1345467540.6704 nan 0.1000 -1781818.2132
## 860 1284112694.3021 nan 0.1000 -1033801.0314
## 880 1228511384.7695 nan 0.1000 -2131905.6926
## 900 1175828691.3132 nan 0.1000 -1699350.7296
## 920 1126868620.2977 nan 0.1000 -1323612.7791
## 940 1080336627.5545 nan 0.1000 -2696691.4681
## 960 1034784034.5754 nan 0.1000 -1423000.5860
## 980 993507439.1745 nan 0.1000 -770384.6284
## 1000 955083086.5235 nan 0.1000 -2242069.1778
## 1020 915259574.1239 nan 0.1000 -1921043.6683
## 1040 880211410.7529 nan 0.1000 -1328014.5198
## 1060 844514923.5192 nan 0.1000 -1446966.5911
## 1080 808006271.3235 nan 0.1000 -611645.7317
## 1100 776243460.9320 nan 0.1000 -1445806.9935
## 1120 743978855.4930 nan 0.1000 -1440203.7486
## 1140 715410879.8794 nan 0.1000 -1409211.6374
## 1160 686095519.2614 nan 0.1000 -1705076.7938
## 1180 658112151.9213 nan 0.1000 -1030954.9728
## 1200 633597814.9575 nan 0.1000 -1261116.2568
## 1220 606746046.1998 nan 0.1000 -1617010.4100
## 1240 583252790.3381 nan 0.1000 -611778.0085
## 1260 560418051.0583 nan 0.1000 -643425.4172
## 1280 538932855.1942 nan 0.1000 -1074348.1058
## 1300 517437985.3314 nan 0.1000 -1047080.2481
## 1320 496612971.5034 nan 0.1000 -1077310.5065
## 1340 476361419.4834 nan 0.1000 -589121.4978
## 1360 458412536.5357 nan 0.1000 -1353139.4387
## 1380 440934625.9902 nan 0.1000 -1022420.0799
## 1400 424946582.8470 nan 0.1000 -889001.5754
## 1420 409365661.8094 nan 0.1000 -933485.4406
## 1440 394047034.0615 nan 0.1000 -489628.6357
## 1460 378905482.9305 nan 0.1000 -782448.0350
## 1480 363942219.5921 nan 0.1000 -388216.7140
## 1500 350632792.2664 nan 0.1000 -732076.5205
##
## - Fold4: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## + Fold5: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 247083399408.5962 nan 0.1000 37501241354.7895
## 2 214190875658.2563 nan 0.1000 28261663397.5734
## 3 185556000383.0656 nan 0.1000 24753704228.4856
## 4 164541525449.9209 nan 0.1000 25141556675.4624
## 5 145344977097.3775 nan 0.1000 16220598536.3777
## 6 129324380833.5192 nan 0.1000 15071061824.1229
## 7 116509782718.4973 nan 0.1000 11959335348.3603
## 8 104684922971.7417 nan 0.1000 10222539400.1551
## 9 94158554142.5341 nan 0.1000 9369006692.4853
## 10 86068300284.4621 nan 0.1000 7897224000.7506
## 20 45495866004.0047 nan 0.1000 961294329.5218
## 40 25771419186.8758 nan 0.1000 101240377.2334
## 60 19844101923.2155 nan 0.1000 -4968505.5322
## 80 16266159040.7614 nan 0.1000 -102550844.5439
## 100 13954600796.7425 nan 0.1000 3093667.7803
## 120 12162761532.8955 nan 0.1000 -58624180.0853
## 140 10865172127.6884 nan 0.1000 -3789307.5080
## 160 9812278280.1839 nan 0.1000 -22387305.6020
## 180 8905962098.1480 nan 0.1000 -61729433.9885
## 200 8106048215.7857 nan 0.1000 -32927105.4332
## 220 7452313429.7202 nan 0.1000 -18828340.7608
## 240 6834811729.3136 nan 0.1000 -15552314.0241
## 260 6356747862.4643 nan 0.1000 -19568189.7910
## 280 5902364964.4194 nan 0.1000 -18185076.3666
## 300 5507616904.6633 nan 0.1000 -6162470.0544
## 320 5165741931.3472 nan 0.1000 -5028224.9325
## 340 4840207778.6727 nan 0.1000 -13597304.4406
## 360 4571357818.9321 nan 0.1000 -10084071.5439
## 380 4307973834.3292 nan 0.1000 -15785446.6860
## 400 4068526508.5216 nan 0.1000 -14386730.7848
## 420 3833286020.0327 nan 0.1000 -5815231.3398
## 440 3624902374.7527 nan 0.1000 -9093130.1954
## 460 3429391781.9701 nan 0.1000 -4403124.8424
## 480 3247512854.9785 nan 0.1000 -3233953.5223
## 500 3084583535.1299 nan 0.1000 -4622351.9705
## 520 2922638273.2607 nan 0.1000 -6776781.8880
## 540 2782081950.7205 nan 0.1000 -4430140.8079
## 560 2650588364.7026 nan 0.1000 -2389542.3791
## 580 2515177825.7343 nan 0.1000 -4830747.4623
## 600 2382989490.2003 nan 0.1000 -5423105.4529
## 620 2270117075.4417 nan 0.1000 -6859453.2752
## 640 2161451289.3375 nan 0.1000 -6666582.5251
## 660 2056633697.8121 nan 0.1000 -4079551.4947
## 680 1961825901.5242 nan 0.1000 -3377136.0361
## 700 1866375547.6987 nan 0.1000 -3332009.2503
## 720 1779976284.9989 nan 0.1000 -2123427.3447
## 740 1703147469.6976 nan 0.1000 -2586867.1983
## 760 1627972406.8891 nan 0.1000 -2598525.9978
## 780 1558957416.9342 nan 0.1000 -1990715.5399
## 800 1486015277.8966 nan 0.1000 -3578333.9256
## 820 1423012762.2545 nan 0.1000 -2775064.3436
## 840 1364875727.8914 nan 0.1000 -4710660.2606
## 860 1306626464.8090 nan 0.1000 -2638789.8846
## 880 1248415323.4075 nan 0.1000 -2309984.4540
## 900 1194212788.3032 nan 0.1000 -2609259.8417
## 920 1145376684.1303 nan 0.1000 -2078252.6511
## 940 1099357047.4095 nan 0.1000 -1463621.9488
## 960 1053487650.1578 nan 0.1000 -1344243.1336
## 980 1012251119.4506 nan 0.1000 -2055021.9353
## 1000 970761608.1475 nan 0.1000 -1625281.5621
## 1020 931149561.0228 nan 0.1000 -2389949.6830
## 1040 891452032.0176 nan 0.1000 -1200632.2636
## 1060 854626166.5113 nan 0.1000 -1601219.4156
## 1080 821206838.5151 nan 0.1000 -1811591.1173
## 1100 786277341.1577 nan 0.1000 -1598537.5702
## 1120 754522753.9770 nan 0.1000 -1741803.8503
## 1140 722286910.0023 nan 0.1000 -1379078.0283
## 1160 693419365.8271 nan 0.1000 -1148727.4660
## 1180 666911195.5088 nan 0.1000 -1416984.5652
## 1200 640766060.3204 nan 0.1000 -1068897.4883
## 1220 616016950.2246 nan 0.1000 -1650803.6851
## 1240 590227658.6053 nan 0.1000 -2169734.1685
## 1260 566762658.2255 nan 0.1000 -1858415.1432
## 1280 545547535.6821 nan 0.1000 -1038303.9203
## 1300 523954315.6165 nan 0.1000 -655315.5955
## 1320 502979816.4506 nan 0.1000 -939184.5008
## 1340 484608997.6856 nan 0.1000 -884484.8674
## 1360 464085640.3439 nan 0.1000 -849878.5048
## 1380 446476045.0519 nan 0.1000 -667002.9491
## 1400 429405590.8392 nan 0.1000 -894615.2848
## 1420 414379173.3425 nan 0.1000 -1354706.8178
## 1440 397999231.5322 nan 0.1000 -698615.8639
## 1460 383498945.5415 nan 0.1000 -1051588.5810
## 1480 369204484.2655 nan 0.1000 -823006.4269
## 1500 355684544.8607 nan 0.1000 -588958.3607
##
## - Fold5: n.trees=1500, interaction.depth=20, shrinkage=0.1, n.minobsinnode=6
## Aggregating results
## Fitting final model on full training set
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 240747470706.3436 nan 0.1000 40335756460.8261
## 2 209276469863.1263 nan 0.1000 31830291195.3716
## 3 182501785356.2228 nan 0.1000 23797901996.6309
## 4 160216123485.5330 nan 0.1000 21348222492.0886
## 5 140319923302.6803 nan 0.1000 16570816125.8933
## 6 123785802334.8807 nan 0.1000 13959713386.3977
## 7 109946082787.6215 nan 0.1000 11834875711.3455
## 8 99585949289.3399 nan 0.1000 9631709479.1579
## 9 89021441892.5609 nan 0.1000 9423287298.2098
## 10 80029291799.4756 nan 0.1000 6412436060.6266
## 20 42424020255.2251 nan 0.1000 1350224473.3087
## 40 25342796949.2491 nan 0.1000 25616816.6570
## 60 20007469725.4679 nan 0.1000 -71559955.0431
## 80 17083225316.2219 nan 0.1000 -77154748.3482
## 100 14744242618.1390 nan 0.1000 -9800602.3626
## 120 12763993914.8526 nan 0.1000 29865587.4144
## 140 11487256265.9115 nan 0.1000 -29066212.6424
## 160 10326887392.4955 nan 0.1000 -19093915.2968
## 180 9532581881.1675 nan 0.1000 -20063424.2863
## 200 8755617837.2249 nan 0.1000 -13069103.3327
## 220 8094207828.1812 nan 0.1000 -8498180.3179
## 240 7558109260.5741 nan 0.1000 -22127176.8729
## 260 7089753924.3190 nan 0.1000 -9254239.8550
## 280 6658204941.7444 nan 0.1000 -15875009.7104
## 300 6244789842.6136 nan 0.1000 -3215898.4804
## 320 5890909843.5875 nan 0.1000 -4738513.4358
## 340 5565311200.1322 nan 0.1000 -14768313.7122
## 360 5268330787.4495 nan 0.1000 -517603.0626
## 380 4993988423.3547 nan 0.1000 -4711173.0775
## 400 4756729470.3140 nan 0.1000 -10161816.7612
## 420 4521520766.1873 nan 0.1000 -5396938.8384
## 440 4299254632.4543 nan 0.1000 -7683085.4257
## 460 4105814467.8920 nan 0.1000 -6458333.5558
## 480 3900278492.9730 nan 0.1000 -5489366.2752
## 500 3716259117.6398 nan 0.1000 -5365170.1295
## 520 3555051806.8844 nan 0.1000 -8710448.2723
## 540 3405546151.9654 nan 0.1000 -4490390.9139
## 560 3261714593.5441 nan 0.1000 -6000396.1786
## 580 3119390145.9773 nan 0.1000 -4574901.8639
## 600 2991432187.1749 nan 0.1000 -3384402.8280
## 620 2863821901.5312 nan 0.1000 -4768467.2596
## 640 2748948060.3182 nan 0.1000 -4806343.9693
## 660 2636289930.6459 nan 0.1000 -3183854.2250
## 680 2538069744.2658 nan 0.1000 -3317600.4216
## 700 2434138189.3943 nan 0.1000 -2081388.6240
## 720 2335075789.4497 nan 0.1000 -2675848.2069
## 740 2242785601.9650 nan 0.1000 -3694878.6293
## 760 2156446327.8471 nan 0.1000 -1762380.1278
## 780 2072714864.5915 nan 0.1000 -2657512.3763
## 800 1999934484.2326 nan 0.1000 -3407683.4034
## 820 1923372846.5349 nan 0.1000 -4496203.3325
## 840 1851286315.4557 nan 0.1000 -2855254.2480
## 860 1784616547.7936 nan 0.1000 -1336991.1305
## 880 1712893074.0278 nan 0.1000 -2644431.5051
## 900 1654180783.5647 nan 0.1000 -3125771.7662
## 920 1594749851.5510 nan 0.1000 -3481019.6937
## 940 1534868256.6099 nan 0.1000 -2344918.3662
## 960 1481497350.4711 nan 0.1000 -1043770.3833
## 980 1430009297.1662 nan 0.1000 -1011523.6244
## 1000 1381617372.4841 nan 0.1000 -3167348.4746
## 1020 1336125576.7027 nan 0.1000 -488885.8330
## 1040 1290636209.6828 nan 0.1000 -2540741.3290
## 1060 1243232073.0791 nan 0.1000 -1804643.1677
## 1080 1197971429.7697 nan 0.1000 -2218861.0984
## 1100 1157554347.7934 nan 0.1000 -1772743.7128
## 1120 1116051551.8456 nan 0.1000 -1633765.5985
## 1140 1076717714.4043 nan 0.1000 -1506510.9265
## 1160 1041551192.7901 nan 0.1000 -1778366.5566
## 1180 1004919242.0786 nan 0.1000 -1784059.3585
## 1200 973777031.6280 nan 0.1000 -1631218.8398
## 1220 941812918.5231 nan 0.1000 -1292715.7271
## 1240 911195280.1575 nan 0.1000 -1377244.8753
## 1260 880061311.2408 nan 0.1000 -1223214.1904
## 1280 850260281.6236 nan 0.1000 -1721176.5105
## 1300 824080252.2648 nan 0.1000 -1490825.0617
## 1320 798311858.4344 nan 0.1000 -1459540.8684
## 1340 772444397.2523 nan 0.1000 -370716.8194
## 1360 747998476.5938 nan 0.1000 -694002.4737
## 1380 721919721.8959 nan 0.1000 -1046810.3980
## 1400 698685534.4784 nan 0.1000 -974183.2226
## 1420 675308969.9010 nan 0.1000 -1221962.9745
## 1440 654142955.9583 nan 0.1000 -876943.3880
## 1460 634574065.0604 nan 0.1000 -638952.0399
## 1480 614461170.2290 nan 0.1000 -1211762.7914
## 1500 595103003.3981 nan 0.1000 -947201.6213
##
## + Fold1: k=7
## - Fold1: k=7
## + Fold2: k=7
## - Fold2: k=7
## + Fold3: k=7
## - Fold3: k=7
## + Fold4: k=7
## - Fold4: k=7
## + Fold5: k=7
## - Fold5: k=7
## Aggregating results
## Fitting final model on full training set
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
## + Fold1: cp=4e-05
## - Fold1: cp=4e-05
## + Fold2: cp=4e-05
## - Fold2: cp=4e-05
## + Fold3: cp=4e-05
## - Fold3: cp=4e-05
## + Fold4: cp=4e-05
## - Fold4: cp=4e-05
## + Fold5: cp=4e-05
## - Fold5: cp=4e-05
## Aggregating results
## Fitting final model on full training set
## + Fold1: fraction=0.8579
## - Fold1: fraction=0.8579
## + Fold2: fraction=0.8579
## - Fold2: fraction=0.8579
## + Fold3: fraction=0.8579
## - Fold3: fraction=0.8579
## + Fold4: fraction=0.8579
## - Fold4: fraction=0.8579
## + Fold5: fraction=0.8579
## - Fold5: fraction=0.8579
## Aggregating results
## Fitting final model on full training set
# Stack the base learners in model_list: a GLM meta-model is trained on their
# predictions, optimised for RMSE and resampled with 10-fold cross-validation.
glm_ensemble <- caretEnsemble::caretStack(
  model_list,
  method = "glm",
  metric = "RMSE",
  trControl = trainControl(method = "cv", number = 10)
)
# Resampled performance of the stacked meta-model.
glm_ensemble$error## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 211112.4 0.8385082 96818.32 32855.95 0.05147205 4612.139
# Add the stacked ensemble to the running model-comparison table under the label "ensemble_stacked".
# NOTE(review): m_comp_table() is defined outside this section; presumably it
# computes RMSE and R-squared for the model and appends a row -- confirm.
m_comp_table(glm_ensemble, "ensemble_stacked")## # A tibble: 8 × 3
## model_name RMSE Rsquare
## <chr> <dbl> <dbl>
## 1 model1_lm 440579. 0.178
## 2 model2_lm 257836. 0.720
## 3 model2_tree 225941. 0.786
## 4 model1_rft 177126. 0.868
## 5 model1_gbm 184484. 0.859
## 6 model1_knn 224306. 0.792
## 7 model1_lasso 259062. 0.716
## 8 ensemble_stacked 170402. 0.878
# Predict prices for test_data with the stacked ensemble.
predictions <- predict(glm_ensemble, test_data)
# Gather the resampling results of the base learners and plot their RMSE
# side by side.
# NOTE(review): the variable `resamples` shadows the resamples() function it
# is created with; consider a distinct name if nothing else relies on it.
resamples <- resamples(model_list)
dotplot(resamples, metric = "RMSE")
# Inspect the fitted meta-model (weights given to each base learner).
summary(glm_ensemble)##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2965486 -51919 2254 48926 4736306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.951e+04 4.209e+03 -9.388 < 2e-16 ***
## rft 4.765e-01 2.752e-02 17.313 < 2e-16 ***
## gbm 4.619e-01 1.886e-02 24.495 < 2e-16 ***
## knn 6.793e-02 9.937e-03 6.836 8.57e-12 ***
## lm 1.320e+00 3.320e-01 3.975 7.09e-05 ***
## rpart 7.588e-02 1.294e-02 5.866 4.59e-09 ***
## lasso -1.332e+00 3.335e-01 -3.995 6.52e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 44493413020)
##
## Null deviance: 2.9480e+15 on 10497 degrees of freedom
## Residual deviance: 4.6678e+14 on 10491 degrees of freedom
## AIC: 287197
##
## Number of Fisher Scoring iterations: 2
In this section you should use the best algorithm you identified to choose 200 properties from the out of sample data.
# Number of properties that must be selected for investment.
numchoose <- 200
oos <- london_house_prices_2019_out_of_sample
# Predict the value of each out-of-sample house with the stacked ensemble.
oos$predict <- predict(glm_ensemble, oos)
# Choose the ones you want to invest in here.
# Make sure you choose exactly 200 of them.
# Pick the properties with the highest predicted percentage return over the
# asking price. with_ties = FALSE keeps slice_max() from returning more than
# `numchoose` rows when several properties tie at the cut-off.
buy_decision <- oos %>%
  mutate(price_delta = (predict - asking_price) / asking_price) %>%
  slice_max(order_by = price_delta, n = numchoose, with_ties = FALSE)
# Flag the selected properties with buy = 1 and every other property with buy = 0.
oos <- oos %>%
  mutate(buy = case_when(ID %in% buy_decision$ID ~ 1,
                         TRUE ~ 0))
# Output your choices. Change the name of the file to your "lastname_firstname.csv".
write.csv(oos, "Nagy-Betegh_Kazmer.csv")