Loading packages and setting the random seed

library(caret)
library(knitr)
library(MASS)
library(dplyr)
library(mice)
library(xgboost)

set.seed(1234)

Loading the training and testing data sets

training <- read.csv(file="train.csv")

testing <- read.csv(file="test.csv")

Data Cleaning and transformations

The data set has 81 variables and 1460 observations. The first variable represents a correlative number. The target variable is SalePrice which represents the property’s sale price in dollars.

missing.summary <- sapply(training, function(x) sum(is.na(x))) 

indexs.missing <- sapply(training, function(x) sum(is.na(x))) > 0 

num.variable.missing <- length(missing.summary[indexs.missing])

The data set has missing data and there are 19 variables that have missing data. The amount of missing data in these variables vary in the range of 1 and 1406.

freq.table.miss <- data.frame( Variable = names(missing.summary[indexs.missing]), Number.of.Missing = as.integer(missing.summary[indexs.missing]), Porcentage.of.Missing = as.numeric(prop.table(missing.summary[indexs.missing])) )

freq.table.miss <- freq.table.miss %>% select(Variable:Porcentage.of.Missing) %>% arrange(desc(Number.of.Missing))

The above table shows the number of missing values and percentage of missing values per variable. There are 6 variables with a number of missing values greater than 5% of the total, these variables will be excluded from the analysis along with the first variable.

Variable Number.of.Missing Porcentage.of.Missing
PoolQC 1453 0.2086145
MiscFeature 1406 0.2018665
Alley 1369 0.1965542
Fence 1179 0.1692749
FireplaceQu 690 0.0990668
LotFrontage 259 0.0371859
GarageType 81 0.0116296
GarageYrBlt 81 0.0116296
GarageFinish 81 0.0116296
GarageQual 81 0.0116296
GarageCond 81 0.0116296
BsmtExposure 38 0.0054559
BsmtFinType2 38 0.0054559
BsmtQual 37 0.0053123
BsmtCond 37 0.0053123
BsmtFinType1 37 0.0053123
MasVnrType 8 0.0011486
MasVnrArea 8 0.0011486
Electrical 1 0.0001436

Moreover, we perform the following tasks:

# We exclude the variables with many missing values

indexs <- missing.summary < 690 

training <- training[, indexs]

# We retain SalePrice 

SalePrice <- training$SalePrice

# We exclude the first variable and Sale Price 

training <- training %>% 
            select(-Id, -SalePrice)

indexs.quantitative <- sapply(training, function(x) is.numeric(x))

# We split the train data set into quantitative variables and cualitatives variables.

training.quantitative <- training[, indexs.quantitative]

training.qualitative <- training[, !indexs.quantitative]

nzv <- nzv(training.qualitative, saveMetrics = TRUE)

kable(nzv)
freqRatio percentUnique zeroVar nzv
MSZoning 5.279816 0.3424658 FALSE FALSE
Street 242.333333 0.1369863 FALSE TRUE
LotShape 1.911157 0.2739726 FALSE FALSE
LandContour 20.809524 0.2739726 FALSE TRUE
Utilities 1459.000000 0.1369863 FALSE TRUE
LotConfig 4.000000 0.3424658 FALSE FALSE
LandSlope 21.261539 0.2054795 FALSE TRUE
Neighborhood 1.500000 1.7123288 FALSE FALSE
Condition1 15.555556 0.6164384 FALSE FALSE
Condition2 240.833333 0.5479452 FALSE TRUE
BldgType 10.701754 0.3424658 FALSE FALSE
HouseStyle 1.631461 0.5479452 FALSE FALSE
RoofStyle 3.989511 0.4109589 FALSE FALSE
RoofMatl 130.363636 0.5479452 FALSE TRUE
Exterior1st 2.319820 1.0273973 FALSE FALSE
Exterior2nd 2.355140 1.0958904 FALSE FALSE
MasVnrType 1.941573 0.2739726 FALSE FALSE
ExterQual 1.856557 0.2739726 FALSE FALSE
ExterCond 8.780822 0.3424658 FALSE FALSE
Foundation 1.020505 0.4109589 FALSE FALSE
BsmtQual 1.050162 0.2739726 FALSE FALSE
BsmtCond 20.169231 0.2739726 FALSE TRUE
BsmtExposure 4.312217 0.2739726 FALSE FALSE
BsmtFinType1 1.028708 0.4109589 FALSE FALSE
BsmtFinType2 23.259259 0.4109589 FALSE TRUE
Heating 79.333333 0.4109589 FALSE TRUE
HeatingQC 1.731308 0.3424658 FALSE FALSE
CentralAir 14.368421 0.1369863 FALSE FALSE
Electrical 14.191489 0.3424658 FALSE FALSE
KitchenQual 1.254266 0.2739726 FALSE FALSE
Functional 40.000000 0.4794521 FALSE TRUE
GarageType 2.248062 0.4109589 FALSE FALSE
GarageFinish 1.433649 0.2054795 FALSE FALSE
GarageQual 27.312500 0.3424658 FALSE TRUE
GarageCond 37.885714 0.3424658 FALSE TRUE
PavedDrive 14.888889 0.2054795 FALSE FALSE
SaleType 10.385246 0.6164384 FALSE FALSE
SaleCondition 9.584000 0.4109589 FALSE FALSE
training.qualitative <- training.qualitative[, !nzv$nzv]

nzv2 <- nzv(training.quantitative, saveMetrics = TRUE)

kable(nzv)
freqRatio percentUnique zeroVar nzv
MSZoning 5.279816 0.3424658 FALSE FALSE
Street 242.333333 0.1369863 FALSE TRUE
LotShape 1.911157 0.2739726 FALSE FALSE
LandContour 20.809524 0.2739726 FALSE TRUE
Utilities 1459.000000 0.1369863 FALSE TRUE
LotConfig 4.000000 0.3424658 FALSE FALSE
LandSlope 21.261539 0.2054795 FALSE TRUE
Neighborhood 1.500000 1.7123288 FALSE FALSE
Condition1 15.555556 0.6164384 FALSE FALSE
Condition2 240.833333 0.5479452 FALSE TRUE
BldgType 10.701754 0.3424658 FALSE FALSE
HouseStyle 1.631461 0.5479452 FALSE FALSE
RoofStyle 3.989511 0.4109589 FALSE FALSE
RoofMatl 130.363636 0.5479452 FALSE TRUE
Exterior1st 2.319820 1.0273973 FALSE FALSE
Exterior2nd 2.355140 1.0958904 FALSE FALSE
MasVnrType 1.941573 0.2739726 FALSE FALSE
ExterQual 1.856557 0.2739726 FALSE FALSE
ExterCond 8.780822 0.3424658 FALSE FALSE
Foundation 1.020505 0.4109589 FALSE FALSE
BsmtQual 1.050162 0.2739726 FALSE FALSE
BsmtCond 20.169231 0.2739726 FALSE TRUE
BsmtExposure 4.312217 0.2739726 FALSE FALSE
BsmtFinType1 1.028708 0.4109589 FALSE FALSE
BsmtFinType2 23.259259 0.4109589 FALSE TRUE
Heating 79.333333 0.4109589 FALSE TRUE
HeatingQC 1.731308 0.3424658 FALSE FALSE
CentralAir 14.368421 0.1369863 FALSE FALSE
Electrical 14.191489 0.3424658 FALSE FALSE
KitchenQual 1.254266 0.2739726 FALSE FALSE
Functional 40.000000 0.4794521 FALSE TRUE
GarageType 2.248062 0.4109589 FALSE FALSE
GarageFinish 1.433649 0.2054795 FALSE FALSE
GarageQual 27.312500 0.3424658 FALSE TRUE
GarageCond 37.885714 0.3424658 FALSE TRUE
PavedDrive 14.888889 0.2054795 FALSE FALSE
SaleType 10.385246 0.6164384 FALSE FALSE
SaleCondition 9.584000 0.4109589 FALSE FALSE
training.quantitative <- training.quantitative[, !nzv2$nzv]

tempData <- mice(training.quantitative,m=5,maxit=50,meth='pmm',seed=1234, printFlag=FALSE)

training.quantitative.imputed <- complete(tempData, 1)

pre.proc <- preProcess(training.quantitative.imputed, method = c("center", "scale", "pca"), thresh = 0.90)

training.quantitative.imputed.pc <- predict(pre.proc, training.quantitative.imputed)

Fitting and running XGBoost model

To use XGBoost model we need to convert qualitative predictors to dummy variables with the following code.

dummies <- dummyVars(~ ., data=training.qualitative)
training.dummies <- as.data.frame(predict(dummies, training.qualitative))

training.imputed <- cbind(training.dummies, training.quantitative.imputed.pc)

training.imputed$SalePrice <- SalePrice 

We split the data set into 2 parts: training data (90%) and testing data (20%).

inTrain <- createDataPartition(y = training.imputed$SalePrice, 
                               p = 0.90, 
                               list = FALSE)

num.variables <- dim(training.imputed)[2]

train.xgboost <- training.imputed[inTrain, ]

test.xgboost <- training.imputed[-inTrain, ]

The XGBoost model was fitted with the following code.

house.xgboost <- xgboost(data = data.matrix(train.xgboost[,-num.variables]),
                         label=data.matrix(train.xgboost[,num.variables]),
                 booster = "gblinear",
                 objective = "reg:linear",
                 max.depth = 20,
                 nround = 10000,
                 lambda = 0,
                 lambda_bias = 0,
                 alpha = 0, 
                 missing=NA, 
                 verbose = 0)

To evaluate the model according to the root-mean-squared-error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price in the testing data, we use the following code:

pred.test.xgboost <- predict(house.xgboost, data.matrix(test.xgboost), missing=NA)

rmse <- sqrt( sum( (log(pred.test.xgboost) - log(test.xgboost$SalePrice))^2 , na.rm = TRUE ) / length(pred.test.xgboost) )

The rmse between the logarithm of the predicted value and the logarithm of the observed sales price in the testing data is 0.1759052.

Predicting with the XGBoost model

To generate the submission file, the following code was used:

n.indexs <- length(indexs)
indexs <- indexs[-n.indexs]

testing <- testing[, indexs]

# We retain Id 

Id <- testing$Id

# We exclude the first variable

testing <- testing %>%
            select(-Id)

indexs.quantitative <- sapply(testing, function(x) is.numeric(x))

# We split the train data set into quantitative variables and cualitatives variables.

testing.quantitative <- testing[, indexs.quantitative]

testing.qualitative <- testing[, !indexs.quantitative]

testing.qualitative <- testing.qualitative[, !nzv$nzv]

testing.quantitative <- testing.quantitative[, !nzv2$nzv]

tempData <- mice(testing.quantitative,m=5,maxit=50,meth='pmm',seed=1234, printFlag=FALSE)

testing.quantitative.imputed <- complete(tempData, 1)

testing.quantitative.imputed.pc <- predict(pre.proc, testing.quantitative.imputed)

testing.dummies <- as.data.frame(predict(dummies, testing.qualitative))

testing.imputed <- cbind(testing.dummies, testing.quantitative.imputed.pc)

pred.testing.xgboost <- predict(house.xgboost, data.matrix(testing.imputed), missing=NA)

submission <- data.frame(Id=Id, SalePrice=pred.testing.xgboost)
write.csv(submission, file="submission002.csv", row.names = FALSE, quote=FALSE)

The result of this submission on Kaggle was a RMSE of 0.17235, it represents an improvement of 0.11619 with respect to the simple linear model.