You will be submiting an entry to Kaggle’s House Prices: Advanced Regression Techniques by fitting a multiple regression model \(\hat{f}(x)\).
Read in data provided by Kaggle for this competition. They are organized in the data/
folder of this RStudio project:
training <- read_csv("data/train.csv")
test <- read_csv("data/test.csv")
sample_submission <- read_csv("data/sample_submission.csv")
Before performing any model fitting, you should always conduct an exploratory data analysis. This will help guide and inform your model fitting.
Always, ALWAYS, ALWAYS start by looking at your raw data. This gives you visual sense of what information you have to help build your predictive models. To get a full description of each variable, read the data dictionary in the data_description.txt
file in the data/
folder.
Note that the following code chunk has eval = FALSE
meaning “don’t evaluate this chunk with knitting” because .Rmd
files won’t knit if they include a View()
:
#View(training)
#glimpse(training)
#View(test)
#glimpse(test)
In particular, pay close attention to the variables and variable types in the sample_submission.csv
. Your submission must match this exactly.
#glimpse(sample_submission)
As much as possible, try to do all your data wrangling here:
### Clean-up the data
# Combine all data for homogenous cleaning
test$SalePrice <- NA # do this so that num of cols match
combined <- rbind(training, test)
# Fix stupid stuff
combined$GarageYrBlt[combined$GarageYrBlt==2207] <- 2007
# Look for fields with lots of NAs
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
## PoolQC MiscFeature Alley Fence SalePrice
## 2909 2814 2721 2348 1459
## FireplaceQu LotFrontage GarageYrBlt GarageFinish GarageQual
## 1420 486 159 159 159
## GarageCond GarageType BsmtCond BsmtExposure BsmtQual
## 159 157 82 82 81
## BsmtFinType2 BsmtFinType1 MasVnrType MasVnrArea MSZoning
## 80 79 24 23 4
## Utilities BsmtFullBath BsmtHalfBath Functional Exterior1st
## 2 2 2 2 1
## Exterior2nd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 1 1 1 1
## Electrical KitchenQual GarageCars GarageArea SaleType
## 1 1 1 1 1
# For the categorical fields where NA = meaningful, change NA to NO
combined$Alley = factor(combined$Alley, levels=c(levels(combined$Alley), "NO"))
combined$Alley[is.na(combined$Alley)] = "NO"
combined$BsmtCond = factor(combined$BsmtCond, levels=c(levels(combined$BsmtCond), "NO"))
combined$BsmtCond[is.na(combined$BsmtCond)] = "NO"
combined$BsmtExposure[is.na(combined$BsmtExposure)] = "NO"
combined$BsmtFinType1 = factor(combined$BsmtFinType1, levels=c(levels(combined$BsmtFinType1), "NO"))
combined$BsmtFinType1[is.na(combined$BsmtFinType1)] = "NO"
combined$BsmtFinType2 = factor(combined$BsmtFinType2, levels=c(levels(combined$BsmtFinType2), "NO"))
combined$BsmtFinType2[is.na(combined$BsmtFinType2)] = "NO"
combined$BsmtQual = factor(combined$BsmtQual, levels=c(levels(combined$BsmtQual), "NO"))
combined$BsmtQual[is.na(combined$BsmtQual)] = "NO"
combined$Electrical = factor(combined$Electrical, levels=c(levels(combined$Electrical), "NO"))
combined$Electrical[is.na(combined$Electrical)] = "NO" # ASSUMED
combined$FireplaceQu = factor(combined$FireplaceQu, levels=c(levels(combined$FireplaceQu), "NO"))
combined$FireplaceQu[is.na(combined$FireplaceQu)] = "NO"
combined$Fence = factor(combined$Fence, levels=c(levels(combined$Fence), "NO"))
combined$Fence[is.na(combined$Fence)] = "NO"
combined$GarageCond = factor(combined$GarageCond, levels=c(levels(combined$GarageCond), "NO"))
combined$GarageCond[is.na(combined$GarageCond)] = "NO"
combined$GarageFinish = factor(combined$GarageFinish, levels=c(levels(combined$GarageFinish), "NO"))
combined$GarageFinish[is.na(combined$GarageFinish)] = "NO"
combined$GarageQual = factor(combined$GarageQual, levels=c(levels(combined$GarageQual), "NO"))
combined$GarageQual[is.na(combined$GarageQual)] = "NO"
combined$GarageType = factor(combined$GarageType, levels=c(levels(combined$GarageType), "NO"))
combined$GarageType[is.na(combined$GarageType)] = "NO"
combined$MasVnrType = factor(combined$MasVnrType, levels=c(levels(combined$MasVnrType), "NO"))
combined$MasVnrType[is.na(combined$MasVnrType)] = "NO"
combined$MiscFeature = factor(combined$MiscFeature, levels=c(levels(combined$MiscFeature), "NO"))
combined$MiscFeature[is.na(combined$MiscFeature)] = "NO"
combined$PoolQC = factor(combined$PoolQC, levels=c(levels(combined$PoolQC), "NO"))
combined$PoolQC[is.na(combined$PoolQC)] = "NO"
combined$Utilities = factor(combined$Utilities, levels=c(levels(combined$Utilities), "NO"))
combined$Utilities[is.na(combined$Utilities)] = "NO" # ASSUMED
# For the categorical fields where NA = missing data, assume most common category
combined$Exterior1st[is.na(combined$Exterior1st)] <- names(sort(-table(combined$Exterior1st)))[1]
combined$Exterior2nd[is.na(combined$Exterior2nd)] <- names(sort(-table(combined$Exterior2nd)))[1]
combined$Functional[is.na(combined$Functional)] <- names(sort(-table(combined$Functional)))[1]
combined$KitchenQual[is.na(combined$KitchenQual)] <- names(sort(-table(combined$KitchenQual)))[1]
combined$MSZoning[is.na(combined$MSZoning)] <- names(sort(-table(combined$MSZoning)))[1]
combined$SaleType[is.na(combined$SaleType)] <- names(sort(-table(combined$SaleType)))[1]
# For the numerical fields where NA = meaningful, make NA 0
combined$BsmtFinSF1[is.na(combined$BsmtFinSF1)] <- 0
combined$BsmtFinSF2[is.na(combined$BsmtFinSF2)] <- 0
combined$BsmtFullBath[is.na(combined$BsmtFullBath)] <- 0
combined$BsmtHalfBath[is.na(combined$BsmtHalfBath)] <- 0
combined$BsmtUnfSF[is.na(combined$BsmtUnfSF)] <- 0
combined$GarageArea[is.na(combined$GarageArea)] <- 0
combined$GarageCars[is.na(combined$GarageCars)] <- 0
combined$GarageYrBlt[is.na(combined$GarageYrBlt)] <- 0
combined$LotFrontage[is.na(combined$LotFrontage)] <- 0
combined$MasVnrArea[is.na(combined$MasVnrArea)] <- 0
combined$TotalBsmtSF[is.na(combined$TotalBsmtSF)] <- 0
# Did we get rid of NAs?
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
## SalePrice
## 1459
# Separate the training and test sets again
training <- combined[1:1460,]
test <- combined[1461:2919,]
### Data wrangling
training <- training %>%
dplyr::mutate(log10_SalePrice = log10(SalePrice))
### CV
training <- training %>%
dplyr::sample_frac(1) %>%
dplyr::mutate(fold = rep(1:5, length = n())) %>%
dplyr::arrange(fold)
fold_RMLSE <- tibble(
fold = c(1:5),
RMLSE = 0
)
RMLSE <- rep(0, 5)
for(j in 1:5){
pretend_training <- training %>%
filter(fold != j)
pretend_test <- training %>%
filter(fold == j)
# Fit model on pretend training
#fitted_spline_model <-
#smooth.spline(x = pretend_training$log10_GrLivArea,
#y = pretend_training$log10_SalePrice, df = df)
model_1_formula <- as.formula("SalePrice ~ LotArea + LotShape")
model_1 <- lm(model_1_formula, data = pretend_training)
# Make predictions
#predicted_points <- predict(fitted_spline_model, x = pretend_test$log10_GrLivArea) %>%
#as_tibble()
predicted_points <- model_1 %>%
broom::augment(newdata = pretend_test)
#predicted_points
RMLSE[j] <- rmsle(predicted_points$.fitted,predicted_points$SalePrice)
}
# 3. Make predictions on test data. Compare this to use of broom::augment()
# for fitted_points()
predicted_points_1 <- model_1 %>%
broom::augment(newdata = test)
#predicted_points_1
mean(RMLSE)
## [1] 0.3787659
sample_submission$SalePrice = predicted_points_1$.fitted
write_csv(sample_submission, path = "data/submission_MVP_abby.csv")
Our score based on our submission’s “Root Mean Squared Logarithmic Error” was 0.39814.
Our RMSLE is 0.3787659 and Kaggle says 0.39814, so we’re in the same order of magnitude.
### CV
training <- training %>%
sample_frac(1) %>%
mutate(fold = rep(1:5, length = n())) %>%
arrange(fold)
fold_RMLSE <- tibble(
fold = c(1:5),
RMLSE = 0
)
RMLSE <- rep(0, 5)
for(j in 1:5){
pretend_training <- training %>%
filter(fold != j)
pretend_test <- training %>%
filter(fold == j)
# Fit model on pretend training
#fitted_spline_model <-
#smooth.spline(x = pretend_training$log10_GrLivArea,
#y = pretend_training$log10_SalePrice, df = df)
model_2_formula <- as.formula("SalePrice ~ LotArea + GrLivArea + FullBath + LotShape + OverallCond + HouseStyle")
model_2 <- lm(model_2_formula, data = pretend_training)
# Make predictions
#predicted_points <- predict(fitted_spline_model, x = pretend_test$log10_GrLivArea) %>%
#as_tibble()
predicted_points <- model_2 %>%
broom::augment(newdata = pretend_test)
#predicted_points
RMLSE[j] <- rmsle(predicted_points$.fitted,predicted_points$SalePrice)
}
# 3. Make predictions on test data. Compare this to use of broom::augment()
# for fitted_points()
predicted_points_2 <- model_2 %>%
broom::augment(newdata = test)
#predicted_points_2
mean(RMLSE)
## [1] 0.2407868
# Make submission
sample_submission$SalePrice = predicted_points_2$.fitted
write_csv(sample_submission, path = "data/submission_reach_for_stars_abby.csv")
Our score based on our submission’s “Root Mean Squared Logarithmic Error” was 0.25310.
Our estimated RMSLE for model two is 0.2407868, while Kaggle suggests 0.24868; these are comparable. Model two performed better than model one.
### Stepwise regression
# Make sure we have at least two levels per feature
training = training[, sapply(training, function(col) length(unique(col))) > 1]
# Define rmsle summary--this is b/c caret doesn't have rmsle by default
# REF https://stackoverflow.com/questions/46827054/create-rmsle-metric-in-caret-in-r
custom_summary = function(data, lev = NULL, model = NULL){
out = rmsle(data[, "obs"], data[, "pred"])
names(out) = c("rmsle")
out
}
# Sample splitting as above
fit_ctrl <- caret::trainControl(method = 'cv', number = 5, summaryFunction = custom_summary)
train_idx <- caret::createDataPartition(training$SalePrice, p = 0.8, list=FALSE, times=1)
psuedoTrain <- training[train_idx,]
psuedoTest <- training[-train_idx,]
psuedoTrain <- psuedoTrain[,c("LotArea", "GrLivArea", "FullBath", "LotShape", "OverallCond", "HouseStyle", "SalePrice")]
# Remove log10_SalePrice and fold
psuedoTrain <- psuedoTrain[,1:7]
# Stepwise selection
step.model <- caret::train(SalePrice ~ ., data = psuedoTrain,
method = "leapSeq",
tuneGrid = data.frame(nvmax = 1:6),
trControl = fit_ctrl
)
step.model$bestTune
nvmax |
---|
1 |
summary(step.model$finalModel)
## Subset selection object
## 14 Variables (and intercept)
## Forced in Forced out
## LotArea FALSE FALSE
## GrLivArea FALSE FALSE
## FullBath FALSE FALSE
## LotShapeIR2 FALSE FALSE
## LotShapeIR3 FALSE FALSE
## LotShapeReg FALSE FALSE
## OverallCond FALSE FALSE
## HouseStyle1.5Unf FALSE FALSE
## HouseStyle1Story FALSE FALSE
## HouseStyle2.5Fin FALSE FALSE
## HouseStyle2.5Unf FALSE FALSE
## HouseStyle2Story FALSE FALSE
## HouseStyleSFoyer FALSE FALSE
## HouseStyleSLvl FALSE FALSE
## 1 subsets of each size up to 1
## Selection Algorithm: 'sequential replacement'
## LotArea GrLivArea FullBath LotShapeIR2 LotShapeIR3 LotShapeReg
## 1 ( 1 ) " " "*" " " " " " " " "
## OverallCond HouseStyle1.5Unf HouseStyle1Story HouseStyle2.5Fin
## 1 ( 1 ) " " " " " " " "
## HouseStyle2.5Unf HouseStyle2Story HouseStyleSFoyer HouseStyleSLvl
## 1 ( 1 ) " " " " " " " "
### CV
training <- training %>%
dplyr::sample_frac(1) %>%
dplyr::mutate(fold = rep(1:5, length = n())) %>%
dplyr::arrange(fold)
fold_RMLSE <- tibble(
fold = c(1:5),
RMLSE = 0
)
RMLSE <- rep(0, 5)
for(j in 1:5){
pretend_training <- training %>%
filter(fold != j)
pretend_test <- training %>%
filter(fold == j)
# Fit model on pretend training
#fitted_spline_model <-
#smooth.spline(x = pretend_training$log10_GrLivArea,
#y = pretend_training$log10_SalePrice, df = df)
model_3_formula <- as.formula("SalePrice ~ GrLivArea")
model_3 <- lm(model_3_formula, data = pretend_training)
# Make predictions
#predicted_points <- predict(fitted_spline_model, x = pretend_test$log10_GrLivArea) %>%
#as_tibble()
predicted_points <- model_3 %>%
broom::augment(newdata = pretend_test)
predicted_points
RMLSE[j] <- rmsle(predicted_points$.fitted,predicted_points$SalePrice)
#RMLSE <- mean(RMLSE)
}
# 3. Make predictions on test data. Compare this to use of broom::augment()
# for fitted_points()
predicted_points_3 <- model_3 %>%
broom::augment(newdata = test)
#predicted_points_3
mean(RMLSE)
## [1] 0.2759732
# Make submission
sample_submission$SalePrice = predicted_points_3$.fitted
write_csv(sample_submission, path = "data/submission_diminishing_returns_abby.csv")
Our score based on our submission’s “Root Mean Squared Logarithmic Error” was 0.29200.
Our RMSLE was 0.2770203, while Kaggle reported 0.29200; these are comparable. Model three did not perform better than model 2, but it did perform better than model 1.
# Reset the psuedoTrain df to include desired predictors
psuedoTrain <- training[train_idx,]
psuedoTrain <- psuedoTrain[,2:65]
# Run Boruta feature selection
# REF https://www.machinelearningplus.com/machine-learning/feature-selection/
boruta_output <- Boruta(SalePrice ~ ., data=psuedoTrain, doTrace=0)
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(boruta_signif)
## [1] "MSSubClass" "MSZoning" "LotFrontage" "LotArea"
## [5] "LotShape" "LandContour" "LandSlope" "Neighborhood"
## [9] "BldgType" "HouseStyle" "OverallQual" "OverallCond"
## [13] "YearBuilt" "YearRemodAdd" "RoofStyle" "Exterior1st"
## [17] "Exterior2nd" "MasVnrArea" "ExterQual" "Foundation"
## [21] "BsmtExposure" "BsmtFinSF1" "BsmtUnfSF" "TotalBsmtSF"
## [25] "HeatingQC" "CentralAir" "`1stFlrSF`" "`2ndFlrSF`"
## [29] "GrLivArea" "BsmtFullBath" "FullBath" "HalfBath"
## [33] "BedroomAbvGr" "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd"
## [37] "Functional" "Fireplaces" "GarageYrBlt" "GarageCars"
## [41] "GarageArea" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [45] "EnclosedPorch" "ScreenPorch" "SaleCondition"
# Do a tentative rough fix
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "MSSubClass" "MSZoning" "LotArea" "LotShape"
## [5] "Neighborhood" "BldgType" "HouseStyle" "OverallQual"
## [9] "OverallCond" "YearBuilt" "YearRemodAdd" "RoofStyle"
## [13] "Exterior1st" "Exterior2nd" "MasVnrArea" "ExterQual"
## [17] "Foundation" "BsmtExposure" "BsmtFinSF1" "BsmtUnfSF"
## [21] "TotalBsmtSF" "HeatingQC" "CentralAir" "`1stFlrSF`"
## [25] "`2ndFlrSF`" "GrLivArea" "BsmtFullBath" "FullBath"
## [29] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "KitchenQual"
## [33] "TotRmsAbvGrd" "Functional" "Fireplaces" "GarageYrBlt"
## [37] "GarageCars" "GarageArea" "PavedDrive" "WoodDeckSF"
## [41] "OpenPorchSF" "EnclosedPorch" "SaleCondition"
# Variable Importance Scores
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
meanImp | decision | |
---|---|---|
GrLivArea | 21.78274 | Confirmed |
OverallQual | 17.63715 | Confirmed |
TotalBsmtSF | 14.00293 | Confirmed |
2ndFlrSF |
13.70135 | Confirmed |
YearBuilt | 13.50049 | Confirmed |
GarageArea | 13.30863 | Confirmed |
### Need to do some quick data wrangling b/c some values in training$OverallQual appear too infrequently
# Check frequency of each ranking
as.data.frame(table(unlist(training$OverallQual)))
Var1 | Freq |
---|---|
1 | 2 |
2 | 3 |
3 | 20 |
4 | 116 |
5 | 397 |
6 | 374 |
7 | 319 |
8 | 168 |
9 | 43 |
10 | 18 |
# Right now, the scoring is one to 10, but I'm going to change it to 1 to 3
training$OverallQual[training$OverallQual < 5] <- '1'
test$OverallQual[test$OverallQual < 5] <- '1'
# Re-check
as.data.frame(table(unlist(training$OverallQual)))
Var1 | Freq |
---|---|
1 | 141 |
10 | 18 |
5 | 397 |
6 | 374 |
7 | 319 |
8 | 168 |
9 | 43 |
as.data.frame(table(unlist(test$OverallQual)))
Var1 | Freq |
---|---|
1 | 142 |
10 | 13 |
5 | 428 |
6 | 357 |
7 | 281 |
8 | 174 |
9 | 64 |
### CV
training <- training %>%
dplyr::sample_frac(1) %>%
dplyr::mutate(fold = rep(1:5, length = n())) %>%
dplyr::arrange(fold)
fold_RMLSE <- tibble(
fold = c(1:5),
RMLSE = 0
)
RMLSE <- rep(0, 5)
for(j in 1:5){
pretend_training <- training %>%
filter(fold != j)
pretend_test <- training %>%
filter(fold == j)
# Fit model on pretend training
#fitted_spline_model <-
#smooth.spline(x = pretend_training$log10_GrLivArea,
#y = pretend_training$log10_SalePrice, df = df)
model_4_formula <- as.formula("SalePrice ~ GrLivArea + OverallQual + TotalBsmtSF + `1stFlrSF` + GarageCars + `2ndFlrSF`")
model_4 <- lm(model_4_formula, data = pretend_training)
# Make predictions
#predicted_points <- predict(fitted_spline_model, x = pretend_test$log10_GrLivArea) %>%
#as_tibble()
predicted_points <- model_4 %>%
broom::augment(newdata = pretend_test)
predicted_points
RMLSE[j] <- rmsle(predicted_points$.fitted,predicted_points$SalePrice)
#RMLSE <- mean(RMLSE)
}
# 3. Make predictions on test data. Compare this to use of broom::augment()
# for fitted_points()
predicted_points_4 <- model_4 %>%
broom::augment(newdata = test)
#predicted_points_3
mean(RMLSE)
## [1] 0.1842599
# Make submission
sample_submission$SalePrice = predicted_points_4$.fitted
write_csv(sample_submission, path = "data/canaonball_abby.csv")
Our score based on our submission’s “Root Mean Squared Logarithmic Error” was 0.18575.
Our RMSLE was 0.1831875, while Kaggle reported 0.18575; these are comparable. This model performed the best!