library(caret)
library(knitr)
library(MASS)
library(dplyr)
library(mice)
library(xgboost)
set.seed(1234)
training <- read.csv(file="train.csv")
testing <- read.csv(file="test.csv")
The data set has 81 variables and 1460 observations. The first variable, Id, is a sequential row identifier. The target variable is SalePrice, the property's sale price in dollars.
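A quick sanity check of the dimensions and the identifier column (a sketch, not part of the modelling pipeline itself):
dim(training)        # 1460 observations, 81 variables
head(training$Id)    # the first variable is a running identifier: 1 2 3 4 5 6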
missing.summary <- sapply(training, function(x) sum(is.na(x)))
indexs.missing <- sapply(training, function(x) sum(is.na(x))) > 0
num.variable.missing <- length(missing.summary[indexs.missing])
The data set contains missing values: 19 variables have at least one missing value, with per-variable counts ranging from 1 to 1453.
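These figures can be confirmed directly from the summary computed above, for example:
num.variable.missing                    # 19 variables with missing values
range(missing.summary[indexs.missing])  # missing counts range from 1 to 1453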
freq.table.miss <- data.frame(Variable = names(missing.summary[indexs.missing]),
                              Number.of.Missing = as.integer(missing.summary[indexs.missing]),
                              Proportion.of.Missing = as.numeric(prop.table(missing.summary[indexs.missing])))
freq.table.miss <- freq.table.miss %>% select(Variable:Proportion.of.Missing) %>% arrange(desc(Number.of.Missing))
The following table shows, for each variable with missing data, the number of missing values and its proportion of all missing values in the data set. Five variables (PoolQC, MiscFeature, Alley, Fence and FireplaceQu) have 690 or more missing values each; these variables are excluded from the analysis, along with the Id column.
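The table is rendered with kable, as are the other tables in this report:
kable(freq.table.miss)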
| Variable | Number.of.Missing | Proportion.of.Missing |
|---|---|---|
| PoolQC | 1453 | 0.2086145 |
| MiscFeature | 1406 | 0.2018665 |
| Alley | 1369 | 0.1965542 |
| Fence | 1179 | 0.1692749 |
| FireplaceQu | 690 | 0.0990668 |
| LotFrontage | 259 | 0.0371859 |
| GarageType | 81 | 0.0116296 |
| GarageYrBlt | 81 | 0.0116296 |
| GarageFinish | 81 | 0.0116296 |
| GarageQual | 81 | 0.0116296 |
| GarageCond | 81 | 0.0116296 |
| BsmtExposure | 38 | 0.0054559 |
| BsmtFinType2 | 38 | 0.0054559 |
| BsmtQual | 37 | 0.0053123 |
| BsmtCond | 37 | 0.0053123 |
| BsmtFinType1 | 37 | 0.0053123 |
| MasVnrType | 8 | 0.0011486 |
| MasVnrArea | 8 | 0.0011486 |
| Electrical | 1 | 0.0001436 |
We then perform the following preprocessing steps:
# We exclude the five variables with 690 or more missing values
indexs <- missing.summary < 690
training <- training[, indexs]
# We retain SalePrice
SalePrice <- training$SalePrice
# We exclude the first variable (Id) and SalePrice
training <- training %>%
select(-Id, -SalePrice)
indexs.quantitative <- sapply(training, function(x) is.numeric(x))
# We split the training data set into quantitative and qualitative variables.
training.quantitative <- training[, indexs.quantitative]
training.qualitative <- training[, !indexs.quantitative]
Next, we check the qualitative variables for near-zero-variance predictors with caret's nzv function; predictors that are almost constant carry little information for the model and are removed.
nzv <- nzv(training.qualitative, saveMetrics = TRUE)
kable(nzv)
| | freqRatio | percentUnique | zeroVar | nzv |
|---|---|---|---|---|
| MSZoning | 5.279816 | 0.3424658 | FALSE | FALSE |
| Street | 242.333333 | 0.1369863 | FALSE | TRUE |
| LotShape | 1.911157 | 0.2739726 | FALSE | FALSE |
| LandContour | 20.809524 | 0.2739726 | FALSE | TRUE |
| Utilities | 1459.000000 | 0.1369863 | FALSE | TRUE |
| LotConfig | 4.000000 | 0.3424658 | FALSE | FALSE |
| LandSlope | 21.261539 | 0.2054795 | FALSE | TRUE |
| Neighborhood | 1.500000 | 1.7123288 | FALSE | FALSE |
| Condition1 | 15.555556 | 0.6164384 | FALSE | FALSE |
| Condition2 | 240.833333 | 0.5479452 | FALSE | TRUE |
| BldgType | 10.701754 | 0.3424658 | FALSE | FALSE |
| HouseStyle | 1.631461 | 0.5479452 | FALSE | FALSE |
| RoofStyle | 3.989511 | 0.4109589 | FALSE | FALSE |
| RoofMatl | 130.363636 | 0.5479452 | FALSE | TRUE |
| Exterior1st | 2.319820 | 1.0273973 | FALSE | FALSE |
| Exterior2nd | 2.355140 | 1.0958904 | FALSE | FALSE |
| MasVnrType | 1.941573 | 0.2739726 | FALSE | FALSE |
| ExterQual | 1.856557 | 0.2739726 | FALSE | FALSE |
| ExterCond | 8.780822 | 0.3424658 | FALSE | FALSE |
| Foundation | 1.020505 | 0.4109589 | FALSE | FALSE |
| BsmtQual | 1.050162 | 0.2739726 | FALSE | FALSE |
| BsmtCond | 20.169231 | 0.2739726 | FALSE | TRUE |
| BsmtExposure | 4.312217 | 0.2739726 | FALSE | FALSE |
| BsmtFinType1 | 1.028708 | 0.4109589 | FALSE | FALSE |
| BsmtFinType2 | 23.259259 | 0.4109589 | FALSE | TRUE |
| Heating | 79.333333 | 0.4109589 | FALSE | TRUE |
| HeatingQC | 1.731308 | 0.3424658 | FALSE | FALSE |
| CentralAir | 14.368421 | 0.1369863 | FALSE | FALSE |
| Electrical | 14.191489 | 0.3424658 | FALSE | FALSE |
| KitchenQual | 1.254266 | 0.2739726 | FALSE | FALSE |
| Functional | 40.000000 | 0.4794521 | FALSE | TRUE |
| GarageType | 2.248062 | 0.4109589 | FALSE | FALSE |
| GarageFinish | 1.433649 | 0.2054795 | FALSE | FALSE |
| GarageQual | 27.312500 | 0.3424658 | FALSE | TRUE |
| GarageCond | 37.885714 | 0.3424658 | FALSE | TRUE |
| PavedDrive | 14.888889 | 0.2054795 | FALSE | FALSE |
| SaleType | 10.385246 | 0.6164384 | FALSE | FALSE |
| SaleCondition | 9.584000 | 0.4109589 | FALSE | FALSE |
training.qualitative <- training.qualitative[, !nzv$nzv]
We apply the same near-zero-variance check to the quantitative variables.
nzv2 <- nzv(training.quantitative, saveMetrics = TRUE)
kable(nzv2)
training.quantitative <- training.quantitative[, !nzv2$nzv]
Missing values in the quantitative variables are imputed with predictive mean matching using mice, and the imputed variables are then centered, scaled, and reduced with principal component analysis, keeping enough components to capture 90% of the variance.
tempData <- mice(training.quantitative, m=5, maxit=50, meth='pmm', seed=1234, printFlag=FALSE)
training.quantitative.imputed <- complete(tempData, 1)
pre.proc <- preProcess(training.quantitative.imputed, method = c("center", "scale", "pca"), thresh = 0.90)
training.quantitative.imputed.pc <- predict(pre.proc, training.quantitative.imputed)
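As a quick optional check, the number of principal components retained to reach the 90% variance threshold is stored on caret's preProcess object:
pre.proc$numComp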
To use an XGBoost model, the qualitative predictors must be converted to dummy variables, which we do with the following code.
dummies <- dummyVars(~ ., data=training.qualitative)
training.dummies <- as.data.frame(predict(dummies, training.qualitative))
training.imputed <- cbind(training.dummies, training.quantitative.imputed.pc)
training.imputed$SalePrice <- SalePrice
We split the data set into two parts: training data (90%) and testing data (10%).
inTrain <- createDataPartition(y = training.imputed$SalePrice,
p = 0.90,
list = FALSE)
num.variables <- dim(training.imputed)[2]
train.xgboost <- training.imputed[inTrain, ]
test.xgboost <- training.imputed[-inTrain, ]
The XGBoost model was fitted with the following code.
house.xgboost <- xgboost(data = data.matrix(train.xgboost[, -num.variables]),
                         label = data.matrix(train.xgboost[, num.variables]),
                         booster = "gblinear",        # linear booster
                         objective = "reg:linear",    # squared-error regression objective
                         max.depth = 20,              # tree parameter; ignored by the linear booster
                         nround = 10000,              # number of boosting iterations
                         lambda = 0,                  # L2 regularization
                         lambda_bias = 0,             # L2 regularization on the bias term
                         alpha = 0,                   # L1 regularization
                         missing = NA,
                         verbose = 0)
To evaluate the model, we use the root-mean-squared error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sale price on the testing data.
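In symbols, the metric computed by the code below is

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log \hat{y}_i - \log y_i\right)^2},$$

where $\hat{y}_i$ is the predicted price, $y_i$ the observed price, and $n$ the number of test observations. In R: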
pred.test.xgboost <- predict(house.xgboost, data.matrix(test.xgboost), missing=NA)
rmse <- sqrt( sum( (log(pred.test.xgboost) - log(test.xgboost$SalePrice))^2 , na.rm = TRUE ) / length(pred.test.xgboost) )
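As a cross-check (a sketch only, assuming all predictions are positive so the logarithms are defined), caret's postResample reports RMSE, R-squared and MAE on the log scale; its RMSE should match the value computed above:
postResample(pred = log(pred.test.xgboost), obs = log(test.xgboost$SalePrice))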
The RMSE between the logarithm of the predicted value and the logarithm of the observed sale price on the testing data is 0.1759052.
To generate the submission file, the following code was used:
# We drop the last element of indexs, which corresponds to SalePrice and is not present in the test set
n.indexs <- length(indexs)
indexs <- indexs[-n.indexs]
testing <- testing[, indexs]
# We retain Id
Id <- testing$Id
# We exclude the first variable
testing <- testing %>%
select(-Id)
indexs.quantitative <- sapply(testing, function(x) is.numeric(x))
# We split the test data set into quantitative and qualitative variables.
testing.quantitative <- testing[, indexs.quantitative]
testing.qualitative <- testing[, !indexs.quantitative]
testing.qualitative <- testing.qualitative[, !nzv$nzv]
testing.quantitative <- testing.quantitative[, !nzv2$nzv]
tempData <- mice(testing.quantitative,m=5,maxit=50,meth='pmm',seed=1234, printFlag=FALSE)
testing.quantitative.imputed <- complete(tempData, 1)
testing.quantitative.imputed.pc <- predict(pre.proc, testing.quantitative.imputed)
testing.dummies <- as.data.frame(predict(dummies, testing.qualitative))
testing.imputed <- cbind(testing.dummies, testing.quantitative.imputed.pc)
pred.testing.xgboost <- predict(house.xgboost, data.matrix(testing.imputed), missing=NA)
submission <- data.frame(Id=Id, SalePrice=pred.testing.xgboost)
write.csv(submission, file="submission002.csv", row.names = FALSE, quote=FALSE)
The result of this submission on Kaggle was an RMSE of 0.17235, an improvement of 0.11619 over the simple linear model.