The data is from the Sberbank housing price competition on Kaggle.
We need to download the data from Kaggle and read it in. (I have already downloaded it to my local directory.)
# Load the package for reading in the data
library(data.table)
# Read in the data from the local directory
sberbank=fread("train.csv",header=TRUE)
Read 98.5% of 30471 rows
Read 30471 rows and 292 (of 292) columns from 0.043 GB file in 00:00:03
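Before dropping incomplete rows, it is worth checking how much data that throws away. A quick sketch (not part of the original run):
# Count the fully observed rows and find the columns with the most NAs
sum(complete.cases(sberbank))
head(sort(colSums(is.na(sberbank)),decreasing=TRUE))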
# Select rows with no missing values (for simplicity, we do not deal with missing values here)
sberbank=sberbank[complete.cases(sberbank),]
sberbank$timestamp=as.Date(sberbank$timestamp)
Change character values into factors and drop some variables for simplicity of demonstration.
library(tidyverse)
# sberbank=sberbank%>%select(-water_1line,-culture_objects_top_25,-thermal_power_plant_raion,-incineration_raion ,-oil_chemistry_raion,-radiation_raion ,-railroad_terminal_raion,-big_market_raion,-nuclear_reactor_raion , -detention_facility_raion)
sberbank$product_type=as.factor(sberbank$product_type)
sberbank$sub_area=as.factor(sberbank$sub_area)
# To decrease model training time, we select only a few variables
sberbank=sberbank%>%select(price_doc,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,sub_area)
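As an aside, per-column as.factor() calls are easy to get wrong when copy-pasted; a one-liner with dplyr converts every character column at once (a sketch, using the mutate_if idiom from the dplyr version of that era):
# Convert all character columns to factors in one step
sberbank=sberbank%>%mutate_if(is.character,as.factor)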
More data preprocessing
##############################################################################################
# Record the time
t1=proc.time()
# Load libraries
library(caret) # Package for machine learning algorithms
library(doParallel) # Package for parallel computing
library(MLmetrics) # Package for calculating model performance
library(tidyverse) # Package for data manipulation
# Preprocess the predictor variables: center, scale, and drop zero-variance columns
data1=sberbank%>%select(-price_doc)
data1=data1%>%preProcess(method=c("center","scale","zv"))%>%predict(newdata=data1)
data1=cbind(data1,price_doc=sberbank$price_doc)
# Split the data
data=data1
set.seed(3) # For reproducibility
index=createDataPartition(data$price_doc,p=0.9,list=FALSE)
train=data[index,]
test=data[-index,]
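One caveat with the flow above: preProcess was estimated on the full dataset before splitting, so the centering and scaling statistics also see the test rows. A leakage-free variant splits first and fits the transform on the training rows only (a sketch; idx, train_raw, test_raw, pp, train2, and test2 are hypothetical names):
# Split the raw data first, then estimate the transform on the training rows only
set.seed(3)
idx=as.vector(createDataPartition(sberbank$price_doc,p=0.9,list=FALSE))
train_raw=sberbank[idx,]
test_raw=sberbank[-idx,]
pp=preProcess(select(train_raw,-price_doc),method=c("center","scale","zv"))
train2=cbind(predict(pp,newdata=select(train_raw,-price_doc)),price_doc=train_raw$price_doc)
test2=cbind(predict(pp,newdata=select(test_raw,-price_doc)),price_doc=test_raw$price_doc)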
# Define number of cores to use in training the model
cl_num=3
# Register parallel backend
cl <- makeCluster(cl_num)
registerDoParallel(cl)
# Set the training control
ctrl=trainControl(method="repeatedcv",number=5,repeats = 1)
# Set the parameter search grid: tune mtry (the number of predictors sampled at each split) from 2 up to the number of predictors
max_split=length(colnames(data))-1
rf_grid=expand.grid(mtry=2:max_split)
set.seed(3) # For reproducibility
rf=train(log(price_doc)~., data=train, method="rf", trControl=ctrl,metric="RMSE", tuneGrid=rf_grid,importance=TRUE)
Loading required package: randomForest
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
varImp(rf)
rf variable importance
Overall
full_sq 100.00
build_year 90.35
kitch_sq 75.45
max_floor 68.76
life_sq 66.39
state 47.93
num_room 47.04
material 33.41
floor 32.25
product_typeOwnerOccupier 25.94
sub_areaOwnerOccupier 20.70
timestamp 0.00
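The fitted forest can be scored on the held-out test set the same way the later models are (a sketch following the rmse4/rmsle4 pattern further down; rmse1 and rmsle1 are hypothetical names). Since the model was trained on log(price_doc), exp() converts predictions back to the price scale.
library(MLmetrics)
rmse1=MLmetrics::RMSE(y_pred=exp(predict(rf,newdata=test)),y_true=test$price_doc)
rmsle1=RMSLE(y_pred=exp(predict(rf,newdata=test)),y_true=test$price_doc)
rmse1
rmsle1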
gc() # garbage collection
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2360247 126.1 3886542 207.6 3886542 207.6
Vcells 9427190 72.0 25856534 197.3 37429521 285.6
# Define number of cores to use in training the model
cl_num=3
# Register parallel backend
cl <- makeCluster(cl_num)
registerDoParallel(cl)
# Set the training control
ctrl=trainControl(method="repeatedcv",number=5,repeats = 1)
# No explicit grid here: tuneLength=10 below lets caret choose 10 candidate values per tuning parameter
set.seed(3) # For reproducibility
xgbLinear=train(log(price_doc)~., data=train, method="xgbLinear", trControl=ctrl,metric="RMSE", tuneLength=10,importance=TRUE)
1 package is needed for this model and is not installed. (xgboost). Would you like to try to install it now?
1: yes
2: no
yes
Installing package into ‘C:/Users/Edward cooper/Documents/R/win-library/3.4’
(as ‘lib’ is unspecified)
cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/src/contrib/PACKAGES.rds': HTTP status was '404 Not Found'
cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/3.4/PACKAGES.rds': HTTP status was '404 Not Found'
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/xgboost_0.6-4.zip'
Content type 'application/zip' length 1693795 bytes (1.6 MB)
downloaded 1.6 MB
package ‘xgboost’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\Edward cooper\AppData\Local\Temp\Rtmp4w3lRX\downloaded_packages
Loading required package: xgboost
Attaching package: ‘xgboost’
The following object is masked from ‘package:dplyr’:
slice
varImp(xgbLinear)
xgbLinear variable importance
Overall
full_sq 100.0000
timestamp 27.5204
build_year 19.0411
life_sq 11.8901
floor 10.9568
max_floor 10.0016
kitch_sq 7.4839
state 6.7344
material 4.3589
num_room 1.7140
product_typeOwnerOccupier 0.5323
sub_areaOwnerOccupier 0.0000
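Because both train() calls used set.seed(3) with the same trainControl, the two models were tuned on identical folds, so caret's resamples() can line up their cross-validated metrics directly (a sketch):
# Collect fold-level metrics from both models and summarize them side by side
res=resamples(list(rf=rf,xgbLinear=xgbLinear))
summary(res)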
gc() # garbage collection
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2489206 133.0 3886542 207.6 3886542 207.6
Vcells 9648471 73.7 25856534 197.3 37429521 285.6
# Define number of cores to use in training the model
cl_num=3
# Register parallel backend
cl <- makeCluster(cl_num)
registerDoParallel(cl)
# Set the training control
ctrl=trainControl(method="repeatedcv",number=5,repeats = 1)
set.seed(3) # For reproducibility
xgbTree=train(log(price_doc)~., data=train, method="xgbTree", trControl=ctrl,metric="RMSE", tuneLength=10,importance=TRUE)
Loading required package: plyr
---------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
---------------------------------------------------------------------
Attaching package: ‘plyr’
The following objects are masked from ‘package:dplyr’:
arrange, count, desc, failwith, id, mutate, rename,
summarise, summarize
The following object is masked from ‘package:purrr’:
compact
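The code that produced rmsle3 below is missing from the log; it presumably mirrors the evaluation of the other models, along these lines (a sketch):
rmsle3=RMSLE(y_pred=exp(predict(xgbTree,newdata=test)),y_true=test$price_doc)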
rmsle3
[1] 0.6226297
What I mean by "shallow" here is that the number of layers in the neural network does not exceed 10.
gc() # garbage collection
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2537445 135.6 3886542 207.6 3886542 207.6
Vcells 10631261 81.2 25856534 197.3 37429521 285.6
# Define number of cores to use in training the model
cl_num=2
# Register parallel backend
cl <- makeCluster(cl_num)
registerDoParallel(cl)
# Set the training control
ctrl=trainControl(method="repeatedcv",number=5,repeats = 1)
# Set the parameter search grid
# (Note: nnet's size is the number of hidden units, which is expected to be a positive integer; see the corrected grid sketched at the end of this section)
nnet_grid=expand.grid(size=c(0.001,0.01,0.1,1,10,100),decay=c(100,10,1,0.1))
set.seed(3) # For reproducibility
nnet=train(log(price_doc)~., data=train, method="nnet", trControl=ctrl,metric="RMSE", tuneGrid=nnet_grid,importance=TRUE)
There were missing values in resampled performance measures.
missing values found in aggregated results
# weights: 14
initial value 1190018.505689
iter 10 value 1189999.352664
iter 10 value 1189999.344892
iter 10 value 1189999.339918
final value 1189999.339918
converged
t2=proc.time()
(t2-t1)/60
user      system    elapsed
0.3941667 0.1545000 416.2190000
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2537385 135.6 3886542 207.6 3886542 207.6
Vcells 10588913 80.8 25856534 197.3 37429521 285.6
nnet
Neural Network
5440 samples
12 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times)
Summary of sample sizes: 4351, 4351, 4351, 4354, 4353
Resampling results across tuning parameters:
size decay RMSE Rsquared
1e-03 0.1 14.79000 NaN
1e-03 1.0 14.79000 NaN
1e-03 10.0 14.79000 NaN
1e-03 100.0 14.79020 7.553303e-05
1e-02 0.1 14.79020 1.115801e-04
1e-02 1.0 14.79020 1.115796e-04
1e-02 10.0 14.79020 1.115801e-04
1e-02 100.0 14.79000 NaN
1e-01 0.1 14.79020 1.115801e-04
1e-01 1.0 14.79020 1.115801e-04
1e-01 10.0 14.79020 4.219260e-04
1e-01 100.0 14.79000 2.496738e-04
1e+00 0.1 14.79002 1.115801e-04
1e+00 1.0 14.79008 1.731596e-03
1e+00 10.0 14.79058 NaN
1e+00 100.0 14.79429 NaN
1e+01 0.1 14.79003 1.778511e-02
1e+01 1.0 14.79007 6.189270e-03
1e+01 10.0 14.79020 7.685211e-03
1e+01 100.0 14.79119 2.651527e-03
1e+02 0.1 NaN NaN
1e+02 1.0 NaN NaN
1e+02 10.0 NaN NaN
1e+02 100.0 NaN NaN
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were size = 0.001 and decay = 10.
# exp() converts the log-scale predictions back to the price scale
rmse4=MLmetrics::RMSE(y_pred=exp(nnet%>%predict(test)),y_true=test$price_doc)
rmsle4=RMSLE(y_pred=exp(nnet%>%predict(test)),y_true=test$price_doc)
rmse4
[1] 11192253
rmsle4
[1] 14.47386
# stopCluster() shuts down the cluster created with makeCluster();
# stopImplicitCluster() only stops clusters that doParallel created implicitly
stopCluster(cl)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2690094 143.7 4703850 251.3 3886542 207.6
Vcells 11436289 87.3 25856534 197.3 37429521 285.6
#varImp(nnet)
It seems that nnet considerably underestimates the price compared to the other models, and it does not perform well with only about 5,000 training points.
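The flat RMSE values and the NaN R-squared entries in the nnet table above are consistent with the grid problem noted earlier: nnet's size must be a positive integer, so values like 0.001 cannot define a usable hidden layer. A more conventional grid would look something like this (a sketch; the values are illustrative and nnet2 is a hypothetical name):
# Integer hidden-unit counts and small decay values; caret sets linout=TRUE
# automatically when method="nnet" is used for regression
nnet_grid2=expand.grid(size=c(1,3,5,10),decay=c(0,0.01,0.1,1))
set.seed(3)
nnet2=train(log(price_doc)~., data=train, method="nnet", trControl=ctrl, metric="RMSE", tuneGrid=nnet_grid2, maxit=500, trace=FALSE)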
cordata=data%>%select(price_doc,full_sq,life_sq,kitch_sq,floor,max_floor,state,material,build_year,num_room)
library(corrplot)
corrplot(cor(cordata, use="complete.obs"))
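The same correlations can be read off numerically, which helps when the plot gets crowded (a sketch):
# Correlation of each selected variable with price_doc, strongest first
round(sort(cor(cordata,use="complete.obs")[,"price_doc"],decreasing=TRUE),2)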
This is the website for the caret package, which offers about 233 machine learning algorithms.
This is my blog. Take a look.