An attempt at Exploratory Data Analysis and Practical Machine Learning for the Kaggle competition "House Prices: Advanced Regression Techniques" (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). I drew on many relevant contributions by Kaggle users, and I appreciate all their ideas and their selflessness in sharing their work.
Loading the libraries
Defining Custom Functions
Loading the given data into memory. Train.csv and Test.csv are provided by Kaggle. Column.csv was created by me to store the preferred data type of each column after studying the column descriptions given by Kaggle. It classifies the columns as numeric or categorical features and also identifies the column (Col 1) that should be ignored.
# Location of the input files: the Kaggle train/test sets and the hand-made
# column description file (preferred data type per column).
datadir <- "./data"
trainfile <- "train.csv"
testfile <- "test.csv"
colFile <- "column.csv"

# Full paths, joined with "/".
traindirfile <- file.path(datadir, trainfile, fsep = "/")
testdirfile <- file.path(datadir, testfile, fsep = "/")
coldirfile <- file.path(datadir, colFile, fsep = "/")

# Read the three files into memory.
training <- read.csv(traindirfile)
testing <- read.csv(testdirfile)
colInfo <- read.csv(coldirfile)
Combine both the training and test dataset to perform data cleaning. Mainly to:
# Scatter plots of the main size/area measures against SalePrice; a positive
# relationship is expected. Rows where the measure is missing are left out.
p1 <- ggplot(subset(training, !is.na(GrLivArea)),
             aes(x = GrLivArea, y = SalePrice)) +
  geom_point(color = 'blue') +
  theme_bw()
p2 <- ggplot(subset(training, !is.na(LotArea)),
             aes(x = LotArea, y = SalePrice)) +
  geom_point(color = 'blue') +
  theme_bw()
p3 <- ggplot(subset(training, !is.na(LotFrontage)),
             aes(x = LotFrontage, y = SalePrice)) +
  geom_point(color = 'blue') +
  theme_bw()
p4 <- ggplot(subset(training, !is.na(GarageArea)),
             aes(x = GarageArea, y = SalePrice)) +
  geom_point(color = 'blue') +
  theme_bw()

# Lay the four panels out in two columns.
multiplot(p1, p2, p3, p4, cols = 2)
### Remove observations that are outliers from the training set.
### A row survives when every size measure is either missing or below its
### cut-off, which matches filtering on the four measures one after another.
training <- subset(
  training,
  (is.na(GrLivArea) | GrLivArea < 4000) &
    (is.na(LotArea) | LotArea < 100000) &
    (is.na(LotFrontage) | LotFrontage < 200) &
    (is.na(GarageArea) | GarageArea < 1500)
)

### GrLivArea against SalePrice, one panel per neighborhood (legend hidden)
ggplot(training, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(aes(color = Neighborhood)) +
  scale_x_continuous(name = "GrLivArea") +
  scale_y_continuous(name = "SalePrice") +
  theme_bw() +
  facet_wrap(~ Neighborhood) +
  theme(legend.position = "none")
A positive linear relationship between GrLivArea and SalePrice is observed across all neighborhoods.
To begin, it is necessary to combine the training and test datasets. This allows the NAs to be fixed consistently, and also allows features to be created on both datasets later.
### Stack the training predictors (every column except the last one, the
### target) on top of the test set so both are cleaned the same way
combi <- rbind(training[, -ncol(training)], testing)

### Assume 0 linear feet of street connected to property if NA
combi <- fixNA(combi, 'LotFrontage', 0)

### For these factors NA actually means the property has no such facility
combi <- Reduce(
  function(df, feature) fixNA(df, feature, 'None'),
  c('MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley'),
  combi
)

### Unknown values of factors with more than 5 levels go into an explicit
### "other" category; the label differs per feature
combi <- local({
  otherLabel <- c(SaleType = 'Oth', MSZoning = 'OTH', Exterior1st = 'Other',
                  Exterior2nd = 'Other', Functional = 'Oth')
  Reduce(
    function(df, feature) fixNA(df, feature, otherLabel[[feature]]),
    names(otherLabel),
    combi
  )
})

### Remaining unknown factors take the most common value of the feature,
### looked up on the data as it stands at that point
combi <- Reduce(
  function(df, feature) fixNA(df, feature, modeFactor(df, feature)),
  c('Utilities', 'Electrical', 'KitchenQual'),
  combi
)
### Special handling for consistency - MasVnrType and MasVnrArea
### (if Area is 0, Type should be None)
### The column is indexed directly (combi$col[rows] <- value) instead of via
### combi[rows, ]$col <- value: the latter stops with "replacement has 1 row,
### data has 0" when no row matches and rejects NA in the row mask. which()
### drops NA mask entries, so those rows are simply left untouched.

# Both type and area unknown: assume there is no veneer at all.
combi$MasVnrArea[which(is.na(combi$MasVnrType) & is.na(combi$MasVnrArea))] <- 0
# Type unknown but an area is recorded: label the type 'None'.
combi$MasVnrType[which(is.na(combi$MasVnrType) & combi$MasVnrArea >= 0)] <- 'None'
# Type 'None' must go with an area of 0.
combi$MasVnrArea[which(combi$MasVnrType %in% 'None' & combi$MasVnrArea >= 0)] <- 0
# An area of 0 must go with type 'None'.
combi$MasVnrType[which(!(combi$MasVnrType %in% 'None') & combi$MasVnrArea == 0)] <- 'None'
### Special handling for consistency - Garage
### (if Area is 0, the rest of the Garage feature should be None)
combi <- fixNA(combi, 'GarageType', 'None')
combi <- fixNA(combi, 'GarageFinish', 'None')
combi <- fixNA(combi, 'GarageQual', 'None')
combi <- fixNA(combi, 'GarageCond', 'None')
combi <- fixNA(combi, 'GarageYrBlt', '0')

### Columns are indexed directly (combi$col[rows] <- value) rather than via
### combi[rows, ]$col <- value, which stops with "replacement has 1 row,
### data has 0" whenever no row matches the condition.
combi$GarageCars[which(is.na(combi$GarageCars))] <- 0
combi$GarageArea[which(is.na(combi$GarageArea))] <- 0
combi$GarageType[which(combi$GarageArea == 0)] <- 'None'

### Detached garages that have a size but no build year: fill finish, quality
### and condition with the most common value of each feature. The row set is
### computed once because the three fills do not change it.
combi <- local({
  orphanGarage <- which(combi$GarageYrBlt %in% '0' &
                          combi$GarageType %in% 'Detchd' &
                          combi$GarageCars > 0 &
                          combi$GarageArea > 0)
  for (feature in c('GarageFinish', 'GarageQual', 'GarageCond')) {
    combi[[feature]][orphanGarage] <- modeFactor(combi, feature)
  }
  combi
})
### Special handling for consistency - Basement
### (if Area is 0, the rest of the Basement feature should be None)
combi <- fixNA(combi, 'BsmtQual', 'None')
combi <- fixNA(combi, 'BsmtCond', 'None')
combi <- fixNA(combi, 'BsmtExposure', 'None')
combi <- fixNA(combi, 'BsmtFinType1', 'None')
combi <- fixNA(combi, 'BsmtFinType2', 'None')

### Columns are indexed directly (combi$col[rows] <- value) rather than via
### combi[rows, ]$col <- value, which stops with "replacement has 1 row,
### data has 0" whenever no row matches the condition.
combi <- local({
  # Rows with an unknown total basement area: zero every basement measure.
  # The row set is taken before TotalBsmtSF itself is overwritten.
  missingBsmt <- which(is.na(combi$TotalBsmtSF))
  for (feature in c('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                    'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF')) {
    combi[[feature]][missingBsmt] <- 0
  }

  # No basement area means no basement bathrooms either.
  noBsmt <- which(combi$TotalBsmtSF == 0)
  combi$BsmtFullBath[noBsmt] <- 0
  combi$BsmtHalfBath[noBsmt] <- 0
  combi
})
Create 2 potentially useful numeric features. Total area of property and total area of 1st and 2nd floor.
### Identify the numerical features: columns flagged "N" in the column file
combiNum <- combi[, colInfo$No[colInfo$DataType == "N"]]

### Total area of the property: the listed size columns added up in order
combiNum$TotalArea <- Reduce(
  `+`,
  combiNum[c('LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
             'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'X1stFlrSF',
             'X2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', 'X3SsnPorch',
             'ScreenPorch', 'LowQualFinSF', 'PoolArea')]
)

### Total area of the 1st and 2nd floor
combiNum$TotalArea1st2nd <- with(combiNum, X1stFlrSF + X2ndFlrSF)
Studying the relationship of all the numeric features with Sale Price, and then identifying the features that are not correlated with it.
Remove numeric features that are not correlated with Sale Price and normalise the remaining numeric features
### Drop the numeric features found to be uncorrelated with Sale Price
combiNum <- combiNum[, !(colnames(combiNum) %in%
                           c('EnclosedPorch', 'LowQualFinSF', 'MiscVal',
                             'OpenPorchSF', 'PoolArea', 'ScreenPorch',
                             'X3SsnPorch'))]

### Normalise the remaining numeric features with log(1 + x)
combiNum <- data.frame(lapply(combiNum, log1p))
Create a new categorical feature by grouping the months with high numbers of sales transactions. Months with many transactions may have an impact on Sale Price.
### Identify the categorical features (columns flagged "F" in the column
### file) and store each of them as a factor
combiFac <- data.frame(
  lapply(combi[, colInfo$No[colInfo$DataType == "F"]], as.factor)
)

### Number of sales per month, drawn as a bar chart
analysedata <- combi %>%
  group_by(MoSold) %>%
  summarise(Count = n()) %>%
  mutate(MoSold = as.factor(MoSold))
ggplot(data = analysedata, aes(x = MoSold, y = Count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal()

### Flag the busy months (April to July) with 1, every other month with 0
combiFac$PopularMonth <- as.factor(
  convertFeature(
    combiFac$MoSold,
    c('1:0', '2:0', '3:0', '4:1',
      '5:1', '6:1', '7:1', '8:0',
      '9:0', '10:0', '11:0', '12:0')
  )
)
Create a new categorical feature by grouping the types of dwelling with high numbers of transactions. Certain types of dwelling may be popular and hence have an impact on Sale Price.
### Number of sales per dwelling type, drawn as a bar chart
analysedata <- combi %>%
  group_by(MSSubClass) %>%
  summarise(Count = n()) %>%
  mutate(MSSubClass = as.factor(MSSubClass))
ggplot(data = analysedata, aes(x = MSSubClass, y = Count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal()

### Flag the frequently sold dwelling types (20, 50, 60, 120) with 1,
### every other type with 0
combiFac$PopularDwelling <- as.factor(
  convertFeature(
    combiFac$MSSubClass,
    c('20:1', '30:0', '40:0', '45:0', '50:1',
      '60:1', '70:0', '75:0', '80:0', '85:0',
      '90:0', '120:1', '150:0', '160:0',
      '180:0', '190:0')
  )
)
Create other relatively straightforward and simple features
# These categorical features deserve a numeric score. Each mapping is a vector
# of "level:value" strings handed to convertFeature().
qual2Val <- c('None:0','Po:1','Fa:2','TA:3','Gd:4','Ex:5')
combiNum <- local({
  # Numeric score of one factor column of combiFac under the given mapping.
  score <- function(feature, mapping) {
    as.numeric(convertFeature(combiFac[[feature]], mapping))
  }
  scored <- combiNum

  # Features graded on the common None/Po/Fa/TA/Gd/Ex scale
  for (feature in c('ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                    'HeatingQC', 'KitchenQual', 'FireplaceQu',
                    'GarageQual', 'GarageCond', 'PoolQC')) {
    scored[[feature]] <- score(feature, qual2Val)
  }

  # Features with a scale of their own
  scored$BsmtExposure <- score('BsmtExposure',
                               c('None:0','No:1','Mn:2','Av:3','Gd:4'))
  bsmtFinish <- c('None:0','Unf:1','LwQ:2','Rec:3','BLQ:4','ALQ:5','GLQ:6')
  scored$BsmtFinType1 <- score('BsmtFinType1', bsmtFinish)
  scored$BsmtFinType2 <- score('BsmtFinType2', bsmtFinish)
  scored$Functional <- score(
    'Functional',
    c('Oth:0','Sal:1','Sev:2','Maj2:3','Maj1:4','Mod:5','Min2:6','Min1:7','Typ:8')
  )
  scored$GarageFinish <- score('GarageFinish',
                               c('None:0','Unf:1','RFn:2','Fin:3'))
  scored$Fence <- score('Fence',
                        c('None:0','MnWw:1','GdWo:2','MnPrv:3','GdPrv:4'))

  # Coarser 3-step version of the overall quality and condition ratings
  # (1-3 -> 1, 4-6 -> 2, 7-10 -> 3), kept next to the raw 1-10 ratings
  threeStep <- c('1:1','2:1','3:1','4:2','5:2','6:2','7:3','8:3','9:3','10:3')
  scored$OverallQual2 <- score('OverallQual', threeStep)
  scored$OverallCond2 <- score('OverallCond', threeStep)
  scored$OverallQual <- as.numeric(as.character(combiFac$OverallQual))
  scored$OverallCond <- as.numeric(as.character(combiFac$OverallCond))

  scored
})
# Yes/no features built with binaryFeature() from one source factor and the
# set of levels of interest.

# Is there a reason the price is a discounted price?
combiFac[['SaleNormal']] <- binaryFeature(combiFac[['SaleCondition']],
                                          c('Normal','Partial'))
# Is it an uncompleted sale?
combiFac[['SaleComplete']] <- binaryFeature(combiFac[['SaleCondition']],
                                            'Partial')
# Is the shape regular?
combiFac[['IsRegularShape']] <- binaryFeature(combiFac[['LotShape']], 'Reg')
# Is Prime Zone?
combiFac[['IsPrimeZone']] <- binaryFeature(combiFac[['MSZoning']],
                                           c('FV','RL','RP'))
# Is Normal Zone?
combiFac[['IsNormalZone']] <- binaryFeature(combiFac[['MSZoning']],
                                            c('RH','RM'))

## Group Neighborhood into five tiers (0 to 4) based on mean Sale Price
combiNum[['NeighborhoodBin']] <- as.numeric(
  convertFeature(
    combiFac[['Neighborhood']],
    c('MeadowV:0',
      'IDOTRR:1','BrDale:1','OldTown:1','Edwards:1',
      'BrkSide:1','Sawyer:1','Blueste:1',
      'SWISU:2','NAmes:2','NPkVill:2','Mitchel:2','SawyerW:2',
      'Gilbert:2','NWAmes:2','Blmngtn:2','CollgCr:2',
      'ClearCr:3','Crawfor:3','Veenker:3','Somerst:3','Timber:3',
      'StoneBr:4','NoRidge:4','NridgHt:4')
  )
)
# Numeric features derived from factor columns. The factors hold numbers as
# level labels, so they are converted label -> character -> number.
combiNum <- local({
  asNumber <- function(f) as.numeric(as.character(f))
  derived <- combiNum

  # How old is the property when it was sold?
  derived$Age <- asNumber(combiFac$YrSold) - asNumber(combiFac$YearBuilt)

  # Counts become numeric because a higher count is expected to go with a
  # higher Sale Price; a half bath counts as 0.5
  derived$BsmtBath <- asNumber(combiFac$BsmtFullBath) +
    asNumber(combiFac$BsmtHalfBath) * 0.5
  derived$Bath <- asNumber(combiFac$FullBath) +
    asNumber(combiFac$HalfBath) * 0.5
  derived$TotalBath <- derived$BsmtBath + derived$Bath
  derived$BedroomAbvGr <- asNumber(combiFac$BedroomAbvGr)
  derived$KitchenAbvGr <- asNumber(combiFac$KitchenAbvGr)
  derived$TotRmsAbvGrd <- asNumber(combiFac$TotRmsAbvGrd)
  derived$Fireplaces <- asNumber(combiFac$Fireplaces)
  derived$GarageCars <- asNumber(combiFac$GarageCars)

  # Timestamp of the transaction: year plus month/12, then passed through
  # normalize()
  derived$DtSold <- normalize(
    asNumber(combiFac$YrSold) + asNumber(combiFac$MoSold) / 12
  )

  derived
})
With new features created, deleting some features that may no longer be useful.
# Categorical features that are dropped now that the derived features exist
combiFac <- combiFac[, !(colnames(combiFac) %in% c(
  'LotShape', 'LandContour', 'LandSlope', 'Electrical', 'GarageType',
  'PavedDrive', 'MiscFeature', 'Neighborhood', 'MSSubClass',
  'YrSold', 'MoSold', 'YearRemodAdd', 'YearBuilt',
  'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
  'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
  'OverallQual', 'OverallCond',
  'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
  'GarageQual', 'GarageCond', 'PoolQC',
  'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional',
  'GarageFinish', 'Fence', 'GarageYrBlt', 'GarageCars'
))]

# Numeric features that are dropped as well
combiNum <- combiNum[, !(colnames(combiNum) %in%
                           c('X2ndFlrSF', 'MasVnrArea', 'WoodDeckSF'))]
Recombine the numeric and categorical features and split the dataset back into the training and test datasets. The normalised Sale Price is added back to the training set.
# Put the numeric and categorical features back side by side.
combi2 <- cbind(combiNum, combiFac)

# The first nrow(training) rows of combi2 are the training rows and the rest
# are the test rows. seq_len() and a logical mask replace 1:n and (n + 1):m,
# which count backwards and pick wrong rows when either part is empty.
# SalePrice (last column of `training`) is attached on the log(1 + x) scale.
train <- cbind(combi2[seq_len(nrow(training)), ],
               SalePrice = log1p(training[, ncol(training)]))
test <- combi2[seq_len(nrow(combi2)) > nrow(training), ]
Splitting the training data into 2 subsets (80/20): one for training the model, the other for out-of-sample validation.
# Split the training set into 2 subsets:
# 80% for training the model, 20% for out-of-sample validation.
# The seed is fixed before the partition is drawn; without it the split, and
# every result that depends on it, changes from run to run.
set.seed(3567)
inTrain <- createDataPartition(y = train$SalePrice, p = 0.80, list = FALSE)
train.data <- train[inTrain, ]
validation.data <- train[-inTrain, ]

# data.table copies of both subsets
traindata <- data.table(train.data)
validationdata <- data.table(validation.data)
Training the model against different combinations of input parameters
set.seed(3567)

# Sparse predictor matrices: every column except the last one (SalePrice)
numCol <- ncol(traindata) - 1
trainx <- Matrix(data.matrix(traindata[, 1:numCol, with = FALSE]),
                 sparse = TRUE)
trainy <- as.numeric(traindata$SalePrice)
inputValid <- Matrix(data.matrix(validationdata[, 1:numCol, with = FALSE]),
                     sparse = TRUE)

# Every combination of the candidate hyper-parameter values
xgbGrid <- expand.grid(
  nrounds = 10000,
  max_depth = c(3, 4, 5, 6),
  eta = seq(0.03, 0.05, by = 0.01),
  gamma = c(0, 1),
  colsample_bytree = seq(0.4, 0.6, by = 0.1),
  min_child_weight = 1,
  subsample = seq(0.4, 0.6, by = 0.1)
)

# 5-fold cross-validation for each grid row, stopping once the test RMSE has
# not improved for 20 rounds. Each grid row yields one result column: test
# and train RMSE at the best iteration, the parameters tried, and that
# iteration number.
# NOTE(review): newer xgboost releases document "reg:squarederror" as the
# replacement for "reg:linear" - confirm against the installed version.
rmseErrorsHyperparameters <- apply(xgbGrid, 1, function(parameterList) {
  candidate <- as.list(parameterList)

  xgbcv <- xgb.cv(
    params = list(
      objective = "reg:linear",
      eta = candidate$eta,
      gamma = candidate$gamma,
      max_depth = candidate$max_depth,
      min_child_weight = candidate$min_child_weight,
      subsample = candidate$subsample,
      colsample_bytree = candidate$colsample_bytree
    ),
    data = trainx, label = trainy,
    nrounds = candidate$nrounds, nfold = 5,
    showsd = TRUE, stratified = TRUE,
    early_stopping_rounds = 20, maximize = FALSE
  )

  bestRound <- xgbcv$best_iteration
  c(xgbcv$evaluation_log$test_rmse_mean[bestRound],
    xgbcv$evaluation_log$train_rmse_mean[bestRound],
    candidate$subsample, candidate$colsample_bytree,
    candidate$min_child_weight, candidate$gamma, candidate$eta,
    candidate$max_depth, candidate$nrounds, bestRound)
})
Choosing the parameters that achieved the lowest RMSE
# One row per grid point; the columns follow the order in which the grid
# search returned its values.
simTrain <- as.data.frame(t(rmseErrorsHyperparameters))
colnames(simTrain) <- c('TestRMSE','TrainRMSE','SubSampleRate','ColSampleRate',
                        'MinChildWgt','Gamma','ETA','MaxDepth','NRound', 'Iteration')
# Gap between cross-validated test and train error
simTrain$Diff <- simTrain$TestRMSE - simTrain$TrainRMSE
# which.min() always picks exactly one row (the first on ties). Comparing
# with == min(...) can return several rows, which would turn every
# hyper-parameter read from bestTrain into a vector.
bestTrain <- simTrain[which.min(simTrain$TestRMSE), ]
bestTrain
## TestRMSE TrainRMSE SubSampleRate ColSampleRate MinChildWgt Gamma ETA
## 73 0.1150474 0.0638832 0.5 0.4 1 0 0.03
## MaxDepth NRound Iteration Diff
## 73 3 10000 623 0.0511642
Testing the best model against the validation dataset
# Hyper-parameters of the best grid point
params <- list(objective = "reg:linear",
               #booster = "gbtree",
               eta = bestTrain$ETA,
               gamma = bestTrain$Gamma,
               max_depth = bestTrain$MaxDepth,
               min_child_weight = bestTrain$MinChildWgt,
               subsample = bestTrain$SubSampleRate,
               colsample_bytree = bestTrain$ColSampleRate
)

# Refit on the training subset for the number of rounds at which
# cross-validation stopped. The argument is spelled out as `nrounds`; the
# former `nround` only worked through partial argument matching.
xgmodel <- xgboost(params = params, data = trainx, label = trainy,
                   nrounds = bestTrain$Iteration)

# Predict the held-out validation rows and keep actual and predicted
# (log-scale) prices side by side
inputValid <- Matrix(data.matrix(validationdata[, c(1:numCol), with = FALSE]),
                     sparse = TRUE)
xgBoostValidation <- predict(xgmodel, inputValid)
predicted <- data.frame(SalePrice = validation.data$SalePrice,
                        Prediction = xgBoostValidation)
Plotting the predicted results against the validation dataset and display the important features
# Actual against predicted Sale Price on the validation rows:
# hollow circles plus a fitted linear regression line
ggplot(predicted, aes(x = SalePrice, y = Prediction)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm)

# Feature importance of the fitted model; the first 20 rows are plotted
mat <- xgb.importance(feature_names = colnames(trainx), model = xgmodel)
xgb.plot.importance(importance_matrix = mat[1:20])
Making an XGBoost prediction and creating the submission file for Kaggle
# PREPARE: one result row per test Id
# NOTE(review): Kaggle's sample submission spells the header `Id`; confirm the
# lowercase `id` written here is accepted.
testid <- testing$Id
result <- data.frame(id = testid)

# XGBOOST PREDICTION on the test predictors (same columns as in training)
testdata <- data.table(test)
inputTest <- Matrix(data.matrix(testdata[, 1:numCol, with = FALSE]),
                    sparse = TRUE)
xgBoostTest <- predict(xgmodel, inputTest)

# RESULT for SUBMISSION: undo the log(1 + x) transform of the target and
# write the first and last columns of the result
result$SalePrice <- expm1(xgBoostTest)
write.csv(result[, c(1, ncol(result))], "submission.csv", row.names = FALSE)
This is my elementary attempt at practical machine learning. Let me know if I made a mistake, and give me feedback on how I could improve the model. Thank you!