Introduction

An attempt at exploratory data analysis and practical machine learning for the Kaggle competition (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). I have drawn on many relevant contributions by Kaggle users, and I appreciate their ideas and their generosity in sharing their work.


Initialisation

  1. Loading the libraries

  2. Defining Custom Functions


Getting Data

Loading the given data into memory. train.csv and test.csv are provided by Kaggle. column.csv was created by me to store the preferred data type of each column after studying the column descriptions given by Kaggle. It classifies the columns as numeric or categorical features and also identifies the column (Col 1) that should be ignored.

# Folder and file names of the two Kaggle inputs and the hand-made column sheet
datadir <- "./data"
trainfile <- "train.csv"
testfile <- "test.csv"
colFile <- "column.csv"

# Full path of each input file
traindirfile <- file.path(datadir, trainfile)
testdirfile <- file.path(datadir, testfile)
coldirfile <- file.path(datadir, colFile)

# Read the three files into data frames
training <- read.csv(file = traindirfile)
testing <- read.csv(file = testdirfile)
colInfo <- read.csv(file = coldirfile)

Cleaning Data

Combine the training and test datasets to perform data cleaning, mainly to:

1. Remove extreme outlier observations (if any)

# Scatter plots of four size/area measures against Sale Price; a positive
# relationship is expected. Rows with a missing area value are left out of
# the corresponding plot.
p1 <- ggplot(training[!is.na(training$GrLivArea), ], aes(x = GrLivArea, y = SalePrice)) +
  geom_point(color = "blue") +
  theme_bw()
p2 <- ggplot(training[!is.na(training$LotArea), ], aes(x = LotArea, y = SalePrice)) +
  geom_point(color = "blue") +
  theme_bw()
p3 <- ggplot(training[!is.na(training$LotFrontage), ], aes(x = LotFrontage, y = SalePrice)) +
  geom_point(color = "blue") +
  theme_bw()
p4 <- ggplot(training[!is.na(training$GarageArea), ], aes(x = GarageArea, y = SalePrice)) +
  geom_point(color = "blue") +
  theme_bw()

# Show the four plots together in a two-column layout
multiplot(p1, p2, p3, p4, cols = 2)

### Remove observations that are outliers from the training set.
### A row is kept only if it passes all four size limits; a missing value in
### one of these columns does not cause the row to be dropped.
training <- training[with(training,
  (GrLivArea < 4000 | is.na(GrLivArea)) &
    (LotArea < 100000 | is.na(LotArea)) &
    (LotFrontage < 200 | is.na(LotFrontage)) &
    (GarageArea < 1500 | is.na(GarageArea))
), ]

# GrLivArea against SalePrice, one panel per neighbourhood; points are
# coloured by neighbourhood and the legend is hidden.
ggplot(data = training, mapping = aes(x = GrLivArea, y = SalePrice)) +
  geom_point(mapping = aes(color = Neighborhood)) +
  facet_wrap(~ Neighborhood) +
  scale_x_continuous(name = "GrLivArea") +
  scale_y_continuous(name = "SalePrice") +
  theme_bw() +
  theme(legend.position = "none")

A positive linear relationship between GrLivArea and SalePrice is observed across all neighborhoods.

2. Fix and populate NAs with values

To begin, it is necessary to combine the training and test datasets. This allows the NAs to be fixed consistently, and it also allows features to be created on both datasets later.

### Stack the training rows (without their last column) on top of the test rows
combi <- rbind(training[, -ncol(training)], testing)

### Assume 0 linear feet of street connected to property if NA
combi <- fixNA(combi, "LotFrontage", 0)

### For these factors an NA actually means the property has no such facility
combi <- Reduce(
  function(df, col) fixNA(df, col, "None"),
  c("MiscFeature", "Fence", "PoolQC", "FireplaceQu", "Alley"),
  combi
)

### Unknown categorical feature set to a new category value - OTHER (more than 5 factors)
combi <- fixNA(combi, "SaleType", "Oth")
combi <- fixNA(combi, "MSZoning", "OTH")
combi <- fixNA(combi, "Exterior1st", "Other")
combi <- fixNA(combi, "Exterior2nd", "Other")
combi <- fixNA(combi, "Functional", "Oth")

### Unknown categorical feature set to most common categorical value.
### The mode is taken from the data frame as it stands when each column is filled.
combi <- Reduce(
  function(df, col) fixNA(df, col, modeFactor(df, col)),
  c("Utilities", "Electrical", "KitchenQual"),
  combi
)

### Special handling for consistency - MasVnrType and MasVnrArea (if Area is 0, Type should be None)
### Rows are located with which() and the column is assigned directly, so a
### rule that matches no rows, or whose condition evaluates to NA, is simply
### skipped instead of stopping the script.

# Neither type nor area recorded: the area becomes 0
combi$MasVnrArea[which(is.na(combi$MasVnrType) & is.na(combi$MasVnrArea))] <- 0
# Type missing while an area is present: the type becomes 'None'
combi$MasVnrType[which(is.na(combi$MasVnrType) & combi$MasVnrArea >= 0)] <- 'None'
# Type 'None': the area is forced to 0
combi$MasVnrArea[which(combi$MasVnrType %in% 'None' & combi$MasVnrArea >= 0)] <- 0
# Area 0 with any other type: the type is forced to 'None'
combi$MasVnrType[which(!(combi$MasVnrType %in% 'None') & combi$MasVnrArea == 0)] <- 'None'

### Special handling for consistency - Garage (if Area is 0, the rest of the Garage feature should be None)
### Columns are assigned through which()-based indices so that a condition
### matching no rows (or evaluating to NA) is skipped rather than raising an error.

# Missing garage descriptors are filled with 'None'; a missing build year with '0'
combi <- fixNA(combi, 'GarageType', 'None')
combi <- fixNA(combi, 'GarageFinish', 'None')
combi <- fixNA(combi, 'GarageQual', 'None')
combi <- fixNA(combi, 'GarageCond', 'None')
combi <- fixNA(combi, 'GarageYrBlt', '0')
combi$GarageCars[is.na(combi$GarageCars)] <- 0
combi$GarageArea[is.na(combi$GarageArea)] <- 0

# A garage with zero area is recorded as no garage
combi$GarageType[which(combi$GarageArea == 0)] <- 'None'

# Detached garages that have capacity and area but a build year of '0':
# their finish, quality and condition are set to the most common value of
# each column. The row selection is computed once and reused for all three.
detchdNoYear <- which(combi$GarageYrBlt %in% '0' &
                          combi$GarageType %in% 'Detchd' &
                          combi$GarageCars > 0 &
                          combi$GarageArea > 0)

combi$GarageFinish[detchdNoYear] <- modeFactor(combi, 'GarageFinish')
combi$GarageQual[detchdNoYear] <- modeFactor(combi, 'GarageQual')
combi$GarageCond[detchdNoYear] <- modeFactor(combi, 'GarageCond')

### Special handling for consistency - Basement (if Area is 0, the rest of the Basement feature should be None)
### Columns are assigned through which()-based indices so that a condition
### matching no rows (or evaluating to NA) is skipped rather than raising an error.

# Missing basement descriptors are filled with 'None'
combi <- fixNA(combi, 'BsmtQual', 'None')
combi <- fixNA(combi, 'BsmtCond', 'None')
combi <- fixNA(combi, 'BsmtExposure', 'None')
combi <- fixNA(combi, 'BsmtFinType1', 'None')
combi <- fixNA(combi, 'BsmtFinType2', 'None')

# Rows with no total basement area recorded: every basement measure becomes 0.
# The rows are located before TotalBsmtSF itself is overwritten.
missingBsmt <- which(is.na(combi$TotalBsmtSF))
combi$BsmtFinSF1[missingBsmt] <- 0
combi$BsmtFinSF2[missingBsmt] <- 0
combi$BsmtUnfSF[missingBsmt] <- 0
combi$BsmtFullBath[missingBsmt] <- 0
combi$BsmtHalfBath[missingBsmt] <- 0
combi$TotalBsmtSF[missingBsmt] <- 0

# A property with zero basement area cannot have basement bathrooms
noBsmt <- which(combi$TotalBsmtSF == 0)
combi$BsmtFullBath[noBsmt] <- 0
combi$BsmtHalfBath[noBsmt] <- 0

Data Exploration, Feature Selection and Feature Engineering

1. Numeric Features

Feature Engineering

Create two potentially useful numeric features: the total area of the property, and the total area of the 1st and 2nd floors.

### Identify the numerical features: the columns whose DataType in colInfo is "N"
combiNum <- combi[, colInfo$No[colInfo$DataType == "N"]]

# Sum of every area-type column.
# NOTE(review): some of these terms look like they overlap (for example
# TotalBsmtSF alongside the three Bsmt*SF parts, GrLivArea alongside the
# floor areas) - confirm the double counting is intended.
combiNum$TotalArea <- with(
  combiNum,
  LotFrontage + LotArea + MasVnrArea + BsmtFinSF1 +
    BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF +
    X2ndFlrSF + GrLivArea + GarageArea + WoodDeckSF +
    OpenPorchSF + EnclosedPorch + X3SsnPorch +
    ScreenPorch + LowQualFinSF + PoolArea
)

# Combined area of the first and second floors
combiNum$TotalArea1st2nd <- with(combiNum, X1stFlrSF + X2ndFlrSF)

Correlation Analysis

Study the relationship between each numeric feature and Sale Price, then identify the features that are not correlated with it.

Remove numeric features that are not correlated with Sale Price and normalise the remaining numeric features

### Remove from Correlation Analysis: drop these seven numeric columns in one step
combiNum <- combiNum[, !colnames(combiNum) %in% c(
  "EnclosedPorch", "LowQualFinSF", "MiscVal", "OpenPorchSF",
  "PoolArea", "ScreenPorch", "X3SsnPorch"
)]

### Normalise the numeric features by applying log1p to every remaining column
combiNum <- data.frame(lapply(combiNum, log1p))

2. Categorical Features

Feature Engineering

Create a new categorical feature by grouping the months with a high number of sales transactions. Months with many transactions may have an impact on Sale Price.

### Identify the categorical features: the columns whose DataType in colInfo
### is "F", each converted to a factor
combiFac <- data.frame(lapply(combi[, colInfo$No[colInfo$DataType == "F"]], as.factor))

# Number of sales per month, shown as a bar chart
analysedata <- combi %>%
  group_by(MoSold) %>%
  summarise(Count = n())
analysedata$MoSold <- as.factor(analysedata$MoSold)
ggplot(data = analysedata, mapping = aes(x = MoSold, y = Count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal()

# Flag months 4 to 7 with 1 and every other month with 0.
# The mapping is the vector "1:0", "2:0", "3:0", "4:1", ..., "12:0".
combiFac$PopularMonth <- as.factor(convertFeature(
  combiFac$MoSold,
  paste0(1:12, ":", as.integer(1:12 %in% 4:7))
))

Create a new categorical feature by grouping the types of dwelling with a high number of transactions. Certain types of dwelling may be popular and hence have an impact on Sale Price.

# Number of sales per dwelling class, shown as a bar chart
analysedata <- combi %>%
  group_by(MSSubClass) %>%
  summarise(Count = n())
analysedata$MSSubClass <- as.factor(analysedata$MSSubClass)
ggplot(data = analysedata, mapping = aes(x = MSSubClass, y = Count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal()

# Flag dwelling classes 20, 50, 60 and 120 with 1 and all other classes with 0.
# The mapping is built as "20:1", "30:0", ..., "190:0".
combiFac$PopularDwelling <- as.factor(convertFeature(
  combiFac$MSSubClass,
  local({
    subClass <- c(20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190)
    paste0(subClass, ":", as.integer(subClass %in% c(20, 50, 60, 120)))
  })
))

3. More Feature Engineering

Create other relatively straightforward and simple features

# Ordered categorical ratings are also given a numeric score in combiNum.
# Ten columns share the same None/Po/Fa/TA/Gd/Ex scale.
qual2Val <- c("None:0", "Po:1", "Fa:2", "TA:3", "Gd:4", "Ex:5")
for (qualCol in c("ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
                  "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC")) {
  combiNum[[qualCol]] <- as.numeric(convertFeature(combiFac[[qualCol]], qual2Val))
}

# The remaining ratings each have their own scale
combiNum$BsmtExposure <- as.numeric(convertFeature(
  combiFac$BsmtExposure,
  c("None:0", "No:1", "Mn:2", "Av:3", "Gd:4")
))
for (finTypeCol in c("BsmtFinType1", "BsmtFinType2")) {
  combiNum[[finTypeCol]] <- as.numeric(convertFeature(
    combiFac[[finTypeCol]],
    c("None:0", "Unf:1", "LwQ:2", "Rec:3", "BLQ:4", "ALQ:5", "GLQ:6")
  ))
}
combiNum$Functional <- as.numeric(convertFeature(
  combiFac$Functional,
  c("Oth:0", "Sal:1", "Sev:2", "Maj2:3", "Maj1:4", "Mod:5", "Min2:6", "Min1:7", "Typ:8")
))
combiNum$GarageFinish <- as.numeric(convertFeature(
  combiFac$GarageFinish,
  c("None:0", "Unf:1", "RFn:2", "Fin:3")
))
combiNum$Fence <- as.numeric(convertFeature(
  combiFac$Fence,
  c("None:0", "MnWw:1", "GdWo:2", "MnPrv:3", "GdPrv:4")
))

# Coarser three-level version of the overall quality/condition scores:
# 1-3 map to 1, 4-6 map to 2 and 7-10 map to 3.
overallBins <- paste0(1:10, ":", rep(1:3, times = c(3, 3, 4)))
combiNum$OverallQual2 <- as.numeric(convertFeature(combiFac$OverallQual, overallBins))
combiNum$OverallCond2 <- as.numeric(convertFeature(combiFac$OverallCond, overallBins))

# The original 1-10 scores as plain numbers (converted via the factor label)
combiNum$OverallQual <- as.numeric(as.character(combiFac$OverallQual))
combiNum$OverallCond <- as.numeric(as.character(combiFac$OverallCond))

# Yes/no features derived from a single categorical column each.
# NOTE(review): binaryFeature() is defined elsewhere; presumably it flags rows
# whose value is in the given set - confirm which side maps to 1.

# Sale condition is 'Normal' or 'Partial' (is there a reason for a discounted price?)
combiFac$SaleNormal <- binaryFeature(combiFac$SaleCondition, c("Normal", "Partial"))

# Sale condition is 'Partial' (is it an uncompleted sale?)
combiFac$SaleComplete <- binaryFeature(combiFac$SaleCondition, "Partial")

# Lot shape is 'Reg' (is the shape regular?)
combiFac$IsRegularShape <- binaryFeature(combiFac$LotShape, "Reg")

# Zoning is FV, RL or RP (is it a prime zone?)
combiFac$IsPrimeZone <- binaryFeature(combiFac$MSZoning, c("FV", "RL", "RP"))

# Zoning is RH or RM (is it a normal zone?)
combiFac$IsNormalZone <- binaryFeature(combiFac$MSZoning, c("RH", "RM"))

## Group Neighborhood based on mean Sale Price: each neighbourhood is given a
## bin from 0 to 4, written as one "name:bin" entry per neighbourhood
combiNum$NeighborhoodBin <- as.numeric(convertFeature(
  combiFac$Neighborhood,
  c(
    paste0("MeadowV", ":0"),
    paste0(c("IDOTRR", "BrDale", "OldTown", "Edwards", "BrkSide", "Sawyer", "Blueste"), ":1"),
    paste0(c("SWISU", "NAmes", "NPkVill", "Mitchel", "SawyerW", "Gilbert", "NWAmes",
             "Blmngtn", "CollgCr"), ":2"),
    paste0(c("ClearCr", "Crawfor", "Veenker", "Somerst", "Timber"), ":3"),
    paste0(c("StoneBr", "NoRidge", "NridgHt"), ":4")
  )
))

# Turn a factor into numbers through its labels (as.character first, so the
# label itself is converted rather than the internal level index)
labelToNumber <- function(f) as.numeric(as.character(f))

# How old is the property when it was sold?
combiNum$Age <- labelToNumber(combiFac$YrSold) - labelToNumber(combiFac$YearBuilt)

# Bathroom counts, with a half bath counted as 0.5; a higher count is expected
# to go with a higher Sale Price
combiNum$BsmtBath <- labelToNumber(combiFac$BsmtFullBath) + labelToNumber(combiFac$BsmtHalfBath) * 0.5
combiNum$Bath <- labelToNumber(combiFac$FullBath) + labelToNumber(combiFac$HalfBath) * 0.5
combiNum$TotalBath <- combiNum$BsmtBath + combiNum$Bath

# Other count-type factors converted to numbers for the same reason
for (countCol in c("BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars")) {
  combiNum[[countCol]] <- labelToNumber(combiFac[[countCol]])
}

# Combining the Month and Year into one time value per transaction
# (year plus month/12), then passing it through normalize()
combiNum$DtSold <- normalize(labelToNumber(combiFac$YrSold) + labelToNumber(combiFac$MoSold) / 12)

4. Feature Elimination

With new features created, deleting some features that may no longer be useful.

# Categorical columns that are dropped now that derived features exist
combiFac <- combiFac[, !colnames(combiFac) %in% c(
  "LotShape", "LandContour", "LandSlope", "Electrical", "GarageType",
  "PavedDrive", "MiscFeature", "Neighborhood", "MSSubClass",
  "YrSold", "MoSold", "YearRemodAdd", "YearBuilt",
  "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
  "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "OverallQual", "OverallCond",
  "BsmtQual", "BsmtCond", "HeatingQC", "KitchenQual", "FireplaceQu",
  "GarageQual", "GarageCond", "PoolQC",
  "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Functional",
  "GarageFinish", "Fence", "GarageYrBlt", "GarageCars"
)]

# Numeric columns that are dropped
combiNum <- combiNum[, !colnames(combiNum) %in% c("X2ndFlrSF", "MasVnrArea", "WoodDeckSF")]

Final Data Preparation

Getting the features ready for Machine Learning

Recombine the numeric and categorical features and split the dataset back into the training and test datasets. The normalised (log-transformed) Sale Price is added back to the training set.

# Put the numeric and categorical features side by side again
combi2 <- cbind(combiNum, combiFac)

# The first nrow(training) rows of combi2 are the training rows and the rest
# are the test rows. The split uses seq_len()-based selections so that an
# empty partition yields zero rows instead of the phantom rows that
# 1:0 or (n + 1):n would select.
nTrain <- nrow(training)
train <- cbind(combi2[seq_len(nTrain), ], SalePrice = training$SalePrice)
test <- combi2[seq_len(nrow(combi2)) > nTrain, ]

# The target is modelled on the log1p scale
train$SalePrice <- log1p(train$SalePrice)

Data Slicing

Splitting the training data into two subsets (80/20): one for training the model and the other for out-of-sample validation.

# Split the training set into two subsets:
# one for training the model and one for out-of-sample validation.
# The seed is fixed first so the random 80/20 partition is the same on every run.
set.seed(3567)
inTrain <- createDataPartition(y = train$SalePrice, p = 0.80, list = FALSE)
validation.data <- train[-inTrain, ]
train.data <- train[inTrain, ]

# data.table copies of both subsets, used when building the model matrices
traindata <- data.table(train.data)
validationdata <- data.table(validation.data)

Training the Model

Using XGBOOST

Training the model against different combinations of input parameters

# Fix the random seed before the hyper-parameter search
set.seed(3567)
# Number of predictor columns: every column of traindata except the last
numCol <- ncol(traindata)-1

# Sparse predictor matrices for the training and validation subsets, and the
# training target as a numeric vector
trainx <- Matrix(data.matrix(traindata[,c(1:numCol),with=FALSE]), sparse=TRUE)
trainy <- as.numeric(traindata$SalePrice)
inputValid <- Matrix(data.matrix(validationdata[,c(1:numCol),with=FALSE]), sparse=TRUE)

# One row per combination of candidate hyper-parameter values
xgbGrid <- expand.grid(
  nrounds = c(10000),
  max_depth = seq(3,6,by=1),
  eta = seq(0.03,0.05,by=0.01),
  gamma = seq(0,1,by=1),
  colsample_bytree = seq(0.4,0.6,by = 0.1),
  min_child_weight = seq(1,1,by = 0.5),
  subsample = seq(0.4,0.6,by = 0.1)
)

# Run 5-fold cross-validation for every grid row. apply() hands each row to
# the function as a named numeric vector; the function returns a vector of ten
# numbers, so the overall result is a matrix with one column per grid row.
rmseErrorsHyperparameters <- apply(xgbGrid, 1, function(parameterList){

    # Extract the parameters to test
    currentSubsampleRate <- parameterList[["subsample"]]
    currentColsampleRate <- parameterList[["colsample_bytree"]]
    currentMin_Child_Weight <- parameterList[["min_child_weight"]]
    currentGamma <- parameterList[["gamma"]]
    currentEta <- parameterList[["eta"]]
    currentMax_Depth <- parameterList[["max_depth"]]
    currentNrounds <- parameterList[["nrounds"]]

    # NOTE(review): newer xgboost releases name this objective
    # "reg:squarederror" and flag "reg:linear" as deprecated - check the
    # installed version before renaming.
    params <- list(objective = "reg:linear",
               #booster = "gbtree",
               #eta = 2/currentNrounds,
               eta = currentEta,
               gamma = currentGamma,
               max_depth = currentMax_Depth,
               min_child_weight = currentMin_Child_Weight,
               subsample = currentSubsampleRate,
               colsample_bytree = currentColsampleRate)

    # nrounds is only an upper bound: early_stopping_rounds = 20 is passed so
    # the run can stop early. TRUE/FALSE are spelled out because T and F are
    # ordinary variables that can be reassigned.
    xgbcv <- xgb.cv(params = params,
                    data = trainx, label = trainy,
                    nrounds = currentNrounds, nfold = 5,
                    showsd = TRUE, stratified = TRUE,
                    early_stopping_rounds = 20, maximize = FALSE)

    # Mean test/train RMSE recorded at the best iteration
    testrmse <- xgbcv$evaluation_log$test_rmse_mean[xgbcv$best_iteration]
    trainrmse <- xgbcv$evaluation_log$train_rmse_mean[xgbcv$best_iteration]

    # Both errors, the parameters that produced them and the best iteration
    return(c(testrmse, trainrmse, currentSubsampleRate, currentColsampleRate,
             currentMin_Child_Weight,currentGamma,currentEta,
             currentMax_Depth,currentNrounds,xgbcv$best_iteration))

})

Selecting the parameters

Choosing the parameter combination that achieved the lowest cross-validated test RMSE

# One row per grid combination: the grid-search result is transposed so that
# each returned vector becomes a row, and the ten values are named
simTrain <- as.data.frame(t(rmseErrorsHyperparameters))
colnames(simTrain) <- c('TestRMSE','TrainRMSE','SubSampleRate','ColSampleRate',
                         'MinChildWgt','Gamma','ETA','MaxDepth','NRound', 'Iteration')
# Gap between the cross-validated test error and the training error
simTrain$Diff <- simTrain$TestRMSE - simTrain$TrainRMSE
# Keep exactly one row: the first combination with the lowest test RMSE.
# which.min() returns a single index even when several combinations tie, so
# every bestTrain$... value used afterwards is a single number.
bestTrain <- simTrain[which.min(simTrain$TestRMSE),]
bestTrain
##     TestRMSE TrainRMSE SubSampleRate ColSampleRate MinChildWgt Gamma  ETA
## 73 0.1150474 0.0638832           0.5           0.4           1     0 0.03
##    MaxDepth NRound Iteration      Diff
## 73        3  10000       623 0.0511642

Validating the model

Testing the best model against the validation dataset

# Parameters of the winning grid combination.
# NOTE(review): newer xgboost releases name this objective "reg:squarederror"
# and flag "reg:linear" as deprecated - check the installed version before renaming.
params <- list(objective = "reg:linear",
               #booster = "gbtree",
               eta = bestTrain$ETA,
               gamma = bestTrain$Gamma,
               max_depth = bestTrain$MaxDepth,
               min_child_weight = bestTrain$MinChildWgt,
               subsample = bestTrain$SubSampleRate,
               colsample_bytree = bestTrain$ColSampleRate
               )

# Fit on the training subset for the number of rounds found by cross-validation.
# The argument is written out in full as nrounds rather than relying on
# partial matching of the abbreviated 'nround'.
xgmodel <- xgboost(params = params, data = trainx, label = trainy, nrounds = bestTrain$Iteration)

# Predict the validation subset and pair each prediction with the actual
# SalePrice held in validation.data
inputValid <- Matrix(data.matrix(validationdata[,c(1:numCol),with=FALSE]), sparse=TRUE)
xgBoostValidation <- predict(xgmodel,inputValid)
predicted <- data.frame(SalePrice=validation.data$SalePrice, Prediction=xgBoostValidation)

Plotting the predictions against the actual values in the validation dataset and displaying the most important features

# Predicted against actual SalePrice for the validation subset
ggplot(predicted, aes(x=SalePrice, y=Prediction)) +
    geom_point(shape=1) +    # Use hollow circles
    geom_smooth(method=lm)   # Add linear regression line

# Feature importance of the fitted model. head() takes the first 20 rows of
# the importance table, or all of them when the model has fewer than 20
# features; indexing with 1:20 would ask for rows that do not exist.
mat <- xgb.importance (feature_names = colnames(trainx),model = xgmodel)
xgb.plot.importance (importance_matrix = head(mat, 20))


Making a Prediction

Making an XGBoost prediction and creating the submission file for Kaggle

# XGBOOST PREDICTION
# Build the sparse predictor matrix for the test rows and predict with the fitted model
testdata <- data.table(test)
inputTest <- Matrix(data.matrix(testdata[, c(1:numCol), with = FALSE]), sparse = TRUE)
xgBoostTest <- predict(xgmodel, inputTest)

# RESULT for SUBMISSION
# One row per test Id; expm1() reverses the log1p() applied to SalePrice before training
testid <- testing$Id
result <- data.frame(id = testid, SalePrice = expm1(xgBoostTest))

# Write the first and last columns of result (id and SalePrice) without row names
write.csv(result[, c(1, ncol(result))], file = "submission.csv", row.names = FALSE)

Conclusion

This is my elementary attempt at practical machine learning. Let me know if I have made a mistake, and give me feedback on how I could improve the model. Thank you!