You will be submitting an entry to Kaggle’s House Prices: Advanced Regression Techniques competition by fitting a multiple regression model \(\hat{f}(x)\).


EDA

Read in data provided by Kaggle for this competition. They are organized in the data/ folder of this RStudio project:

training <- read_csv("data/train.csv")
test <- read_csv("data/test.csv")
sample_submission <- read_csv("data/sample_submission.csv")
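These calls, and the modeling code later in this report, assume the following packages are loaded (a sketch of the setup chunk; in particular, rmsle() is assumed to come from the Metrics package, and caret’s leapSeq method additionally needs the leaps package installed):

library(tidyverse)  # read_csv(), the dplyr verbs, and ggplot2
library(broom)      # augment() for attaching predictions
library(caret)      # trainControl(), createDataPartition(), train()
library(Metrics)    # rmsle() -- an assumption about where rmsle() comes from
library(Boruta)     # Boruta(), getSelectedAttributes(), TentativeRoughFix(), attStats()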

Before fitting any models, you should always conduct an exploratory data analysis. It will guide and inform your modeling choices.

Look at your data!

Always, ALWAYS, ALWAYS start by looking at your raw data. This gives you a visual sense of what information you have available to build your predictive models. To get a full description of each variable, read the data dictionary in the data_description.txt file in the data/ folder.

Note that the following code chunk has eval = FALSE, meaning “don’t evaluate this chunk when knitting,” because .Rmd files won’t knit if they include a View() call:

#View(training)
#glimpse(training)

#View(test)
#glimpse(test)

In particular, pay close attention to the variables and variable types in the sample_submission.csv. Your submission must match this exactly.

#glimpse(sample_submission)
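Beyond glimpsing the raw tables, a minimal visual EDA might check the outcome’s distribution and its relationship to an obvious predictor like living area (a sketch, assuming ggplot2 is loaded via the tidyverse):

# SalePrice is right-skewed, which motivates the log10 transform used later
ggplot(training, aes(x = SalePrice)) +
  geom_histogram(bins = 50)

# Bigger houses tend to sell for more
ggplot(training, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.3)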

Data wrangling

As much as possible, try to do all your data wrangling here:

### Clean up the data
# Combine all rows for homogeneous cleaning
test$SalePrice <- NA # so the number of columns matches before rbind()
combined <- rbind(training, test)

# Fix an obvious data-entry error: a garage built in 2207
combined$GarageYrBlt[combined$GarageYrBlt == 2207] <- 2007

# Look for fields with lots of NAs
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
##       PoolQC  MiscFeature        Alley        Fence    SalePrice 
##         2909         2814         2721         2348         1459 
##  FireplaceQu  LotFrontage  GarageYrBlt GarageFinish   GarageQual 
##         1420          486          159          159          159 
##   GarageCond   GarageType     BsmtCond BsmtExposure     BsmtQual 
##          159          157           82           82           81 
## BsmtFinType2 BsmtFinType1   MasVnrType   MasVnrArea     MSZoning 
##           80           79           24           23            4 
##    Utilities BsmtFullBath BsmtHalfBath   Functional  Exterior1st 
##            2            2            2            2            1 
##  Exterior2nd   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF 
##            1            1            1            1            1 
##   Electrical  KitchenQual   GarageCars   GarageArea     SaleType 
##            1            1            1            1            1
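For reference, the same NA census can be written in tidyverse style (a sketch):

# Sketch: count NAs per column, keep columns that have any, sort descending
combined %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  tidyr::pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  filter(n_missing > 0) %>%
  arrange(desc(n_missing))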
# For the categorical fields where NA is meaningful (the feature is absent), recode NA to "NO".
# Note: with read_csv() these columns arrive as character, not factor, so we recode the values
# directly and convert to factor at the end (the original levels<- approach assumed factors).
# Electrical and Utilities are included on the ASSUMPTION that their NAs also mean "none".
na_means_no <- c("Alley", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                 "BsmtQual", "Electrical", "FireplaceQu", "Fence", "GarageCond",
                 "GarageFinish", "GarageQual", "GarageType", "MasVnrType",
                 "MiscFeature", "PoolQC", "Utilities")
for (col in na_means_no) {
  values <- as.character(combined[[col]])
  values[is.na(values)] <- "NO"
  combined[[col]] <- factor(values)
}

# For the categorical fields where NA means genuinely missing data, impute the most common category
impute_mode <- c("Exterior1st", "Exterior2nd", "Functional", "KitchenQual", "MSZoning", "SaleType")
for (col in impute_mode) {
  mode_value <- names(sort(table(combined[[col]]), decreasing = TRUE))[1]
  combined[[col]][is.na(combined[[col]])] <- mode_value
}


# For the numerical fields where NA is meaningful (no basement, no garage, ...), set NA to 0.
# Caveat: NAs in LotFrontage are arguably missing data rather than true zeros; we impute 0
# here for simplicity.
na_means_zero <- c("BsmtFinSF1", "BsmtFinSF2", "BsmtFullBath", "BsmtHalfBath", "BsmtUnfSF",
                   "GarageArea", "GarageCars", "GarageYrBlt", "LotFrontage", "MasVnrArea",
                   "TotalBsmtSF")
for (col in na_means_zero) {
  combined[[col]][is.na(combined[[col]])] <- 0
}

# Did we get rid of NAs?
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
## SalePrice 
##      1459
# Separate the training and test sets again
training <- combined[1:1460,]
test <- combined[1461:2919,]
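A slightly safer variant of this split avoids the hard-coded row indices by keying on which rows have a SalePrice (a sketch; it relies on test$SalePrice having been set to NA above, and on the training rows coming first in combined):

# Sketch: split on SalePrice availability instead of magic row numbers
n_train <- sum(!is.na(combined$SalePrice))  # 1460 training rows
training <- combined[seq_len(n_train), ]
test <- combined[-seq_len(n_train), ]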

### Data wrangling
training <- training %>% 
  dplyr::mutate(log10_SalePrice = log10(SalePrice))
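Note that log10_SalePrice is created here but the models below are fit on the raw SalePrice. Since Kaggle scores on a log scale, one could instead fit on the log and back-transform the predictions (a sketch, not used in the models below):

# Sketch: fit on the log10 scale, then back-transform predictions to dollars
log_model <- lm(log10_SalePrice ~ GrLivArea, data = training)
log_scale_preds <- 10^predict(log_model, newdata = test)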

Minimum viable product

Model fitting

### CV
# Shuffle the rows and deal each one into one of 5 folds
training <- training %>% 
  dplyr::sample_frac(1) %>% 
  dplyr::mutate(fold = rep(1:5, length = n())) %>% 
  dplyr::arrange(fold)

RMSLE <- rep(0, 5)
for(j in 1:5){
  pretend_training <- training %>% 
    filter(fold != j)
  pretend_test <- training %>% 
    filter(fold == j)
  
  # Fit model on pretend training
  model_1_formula <- as.formula("SalePrice ~ LotArea + LotShape")
  model_1 <- lm(model_1_formula, data = pretend_training)
  
  # Make predictions on the held-out fold
  predicted_points <- model_1 %>%
    broom::augment(newdata = pretend_test)
  
  # Score the held-out fold
  RMSLE[j] <- rmsle(predicted_points$.fitted, predicted_points$SalePrice)
}

# Refit on the full training set, then make predictions on the test data
model_1 <- lm(model_1_formula, data = training)
predicted_points_1 <- model_1 %>%
  broom::augment(newdata = test)
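For reference, the score Kaggle reports is the root mean squared logarithmic error. The rmsle() helper used above (assumed to be Metrics::rmsle()) computes, for actual prices \(y_i\) and predictions \(\hat{y}_i\), \(\text{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\big(\log(\hat{y}_i + 1) - \log(y_i + 1)\big)^2}\). Because the difference is squared, the metric is symmetric in its two arguments, so passing (.fitted, SalePrice) rather than (SalePrice, .fitted) makes no difference. A hand-rolled equivalent (a sketch):

# Sketch: RMSLE by hand, equivalent to the rmsle() call in the loop
rmsle_by_hand <- function(actual, predicted) {
  sqrt(mean((log(predicted + 1) - log(actual + 1))^2))
}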

Estimate of your Kaggle score

mean(RMSLE)
## [1] 0.3787659

Create your submission CSV

sample_submission$SalePrice <- predicted_points_1$.fitted
write_csv(sample_submission, file = "data/submission_MVP_abby.csv")
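Before uploading, a quick sanity check that the file matches the sample format can save a rejected submission (a sketch, assuming the sample has exactly the columns Id and SalePrice):

# Sketch: verify the submission matches the sample format
stopifnot(
  identical(names(sample_submission), c("Id", "SalePrice")),  # columns match the sample
  !any(is.na(sample_submission$SalePrice))                    # every house has a prediction
)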

Screenshot of your Kaggle score

Our submission scored a “Root Mean Squared Logarithmic Error” of 0.39814.


Due diligence

Comparisons of estimated scores and Kaggle scores

Our estimated RMSLE is 0.3787659 and Kaggle reported 0.39814, so our cross-validation estimate is close to (if slightly more optimistic than) the leaderboard score.


Reaching for the stars

Model fitting

### CV
training <- training %>% 
  sample_frac(1) %>% 
  mutate(fold = rep(1:5, length = n())) %>% 
  arrange(fold)

RMSLE <- rep(0, 5)
for(j in 1:5){
  pretend_training <- training %>% 
    filter(fold != j)
  pretend_test <- training %>% 
    filter(fold == j)
  
  # Fit model on pretend training
  model_2_formula <- as.formula("SalePrice ~ LotArea + GrLivArea + FullBath + LotShape + OverallCond + HouseStyle")
  model_2 <- lm(model_2_formula, data = pretend_training)
  
  # Make predictions on the held-out fold and score it
  predicted_points <- model_2 %>%
    broom::augment(newdata = pretend_test)
  RMSLE[j] <- rmsle(predicted_points$.fitted, predicted_points$SalePrice)
}

# Refit on the full training set, then make predictions on the test data
model_2 <- lm(model_2_formula, data = training)
predicted_points_2 <- model_2 %>%
  broom::augment(newdata = test)

Estimate of your Kaggle score

mean(RMSLE)
## [1] 0.2407868

Create your submission CSV

# Make submission
sample_submission$SalePrice <- predicted_points_2$.fitted
write_csv(sample_submission, file = "data/submission_reach_for_stars_abby.csv")

Screenshot of your Kaggle score

Our submission scored a “Root Mean Squared Logarithmic Error” of 0.25310.

Comparisons of estimated scores and Kaggle scores

Our estimated RMSLE for model two is 0.2407868, while Kaggle reported 0.25310; these are comparable. Model two performed better than model one.


Point of diminishing returns

Model fitting [1]

  ### Stepwise regression

  # Make sure we have at least two levels per feature
  training <- training[, sapply(training, function(col) length(unique(col))) > 1]

  # Define an rmsle summary function, because caret doesn't provide RMSLE by default
  # REF https://stackoverflow.com/questions/46827054/create-rmsle-metric-in-caret-in-r
  custom_summary <- function(data, lev = NULL, model = NULL){
    out <- rmsle(data[, "obs"], data[, "pred"])
    names(out) <- "rmsle"
    out
  }

  # Sample splitting as above
  fit_ctrl <- caret::trainControl(method = 'cv', number = 5, summaryFunction = custom_summary)
  train_idx <- caret::createDataPartition(training$SalePrice, p = 0.8, list = FALSE, times = 1)
  pseudoTrain <- training[train_idx,]
  pseudoTest <- training[-train_idx,]

  # Keep only the candidate predictors and the outcome (this also drops log10_SalePrice and fold)
  pseudoTrain <- pseudoTrain[,c("LotArea", "GrLivArea", "FullBath", "LotShape", "OverallCond", "HouseStyle", "SalePrice")]

  # Stepwise selection; metric and maximize are set explicitly so caret minimizes our
  # custom rmsle rather than looking for its default RMSE
  step.model <- caret::train(SalePrice ~ ., data = pseudoTrain,
                             method = "leapSeq",
                             tuneGrid = data.frame(nvmax = 1:6),
                             metric = "rmsle",
                             maximize = FALSE,
                             trControl = fit_ctrl)

  step.model$bestTune
##   nvmax
## 1     1
  summary(step.model$finalModel)
## Subset selection object
## 14 Variables  (and intercept)
##                  Forced in Forced out
## LotArea              FALSE      FALSE
## GrLivArea            FALSE      FALSE
## FullBath             FALSE      FALSE
## LotShapeIR2          FALSE      FALSE
## LotShapeIR3          FALSE      FALSE
## LotShapeReg          FALSE      FALSE
## OverallCond          FALSE      FALSE
## HouseStyle1.5Unf     FALSE      FALSE
## HouseStyle1Story     FALSE      FALSE
## HouseStyle2.5Fin     FALSE      FALSE
## HouseStyle2.5Unf     FALSE      FALSE
## HouseStyle2Story     FALSE      FALSE
## HouseStyleSFoyer     FALSE      FALSE
## HouseStyleSLvl       FALSE      FALSE
## 1 subsets of each size up to 1
## Selection Algorithm: 'sequential replacement'
##          LotArea GrLivArea FullBath LotShapeIR2 LotShapeIR3 LotShapeReg
## 1  ( 1 ) " "     "*"       " "      " "         " "         " "        
##          OverallCond HouseStyle1.5Unf HouseStyle1Story HouseStyle2.5Fin
## 1  ( 1 ) " "         " "              " "              " "             
##          HouseStyle2.5Unf HouseStyle2Story HouseStyleSFoyer HouseStyleSLvl
## 1  ( 1 ) " "              " "              " "              " "
Stepwise selection settled on a single predictor (nvmax = 1), GrLivArea, so model three uses just that variable.

  ### CV
  training <- training %>%
    dplyr::sample_frac(1) %>%
    dplyr::mutate(fold = rep(1:5, length = n())) %>%
    dplyr::arrange(fold)

  RMSLE <- rep(0, 5)
  for(j in 1:5){
    pretend_training <- training %>%
      filter(fold != j)
    pretend_test <- training %>%
      filter(fold == j)

    # Fit model on pretend training
    model_3_formula <- as.formula("SalePrice ~ GrLivArea")
    model_3 <- lm(model_3_formula, data = pretend_training)

    # Make predictions on the held-out fold and score it
    predicted_points <- model_3 %>%
      broom::augment(newdata = pretend_test)
    RMSLE[j] <- rmsle(predicted_points$.fitted, predicted_points$SalePrice)
  }

  # Refit on the full training set, then make predictions on the test data
  model_3 <- lm(model_3_formula, data = training)
  predicted_points_3 <- model_3 %>%
    broom::augment(newdata = test)

Estimate of your Kaggle score

  mean(RMSLE)
## [1] 0.2759732

Create your submission CSV

  # Make submission
  sample_submission$SalePrice <- predicted_points_3$.fitted
  write_csv(sample_submission, file = "data/submission_diminishing_returns_abby.csv")

Screenshot of your Kaggle score

Our submission scored a “Root Mean Squared Logarithmic Error” of 0.29200.

Comparisons of estimated scores and Kaggle scores

Our estimated RMSLE was 0.2759732, while Kaggle reported 0.29200; these are comparable. Model three did not perform better than model two, but it did outperform model one.

Polishing the cannonball

Model fitting [2]

# Reset the pseudoTrain data frame to include all candidate predictors
pseudoTrain <- training[train_idx,]
pseudoTrain <- pseudoTrain[,2:65]  # keep the predictor columns plus SalePrice; drops Id and the trailing helper columns

# Run Boruta feature selection
# REF https://www.machinelearningplus.com/machine-learning/feature-selection/
boruta_output <- Boruta(SalePrice ~ ., data = pseudoTrain, doTrace = 0)
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(boruta_signif) 
##  [1] "MSSubClass"    "MSZoning"      "LotFrontage"   "LotArea"      
##  [5] "LotShape"      "LandContour"   "LandSlope"     "Neighborhood" 
##  [9] "BldgType"      "HouseStyle"    "OverallQual"   "OverallCond"  
## [13] "YearBuilt"     "YearRemodAdd"  "RoofStyle"     "Exterior1st"  
## [17] "Exterior2nd"   "MasVnrArea"    "ExterQual"     "Foundation"   
## [21] "BsmtExposure"  "BsmtFinSF1"    "BsmtUnfSF"     "TotalBsmtSF"  
## [25] "HeatingQC"     "CentralAir"    "`1stFlrSF`"    "`2ndFlrSF`"   
## [29] "GrLivArea"     "BsmtFullBath"  "FullBath"      "HalfBath"     
## [33] "BedroomAbvGr"  "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd" 
## [37] "Functional"    "Fireplaces"    "GarageYrBlt"   "GarageCars"   
## [41] "GarageArea"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [45] "EnclosedPorch" "ScreenPorch"   "SaleCondition"
# Do a tentative rough fix
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
##  [1] "MSSubClass"    "MSZoning"      "LotArea"       "LotShape"     
##  [5] "Neighborhood"  "BldgType"      "HouseStyle"    "OverallQual"  
##  [9] "OverallCond"   "YearBuilt"     "YearRemodAdd"  "RoofStyle"    
## [13] "Exterior1st"   "Exterior2nd"   "MasVnrArea"    "ExterQual"    
## [17] "Foundation"    "BsmtExposure"  "BsmtFinSF1"    "BsmtUnfSF"    
## [21] "TotalBsmtSF"   "HeatingQC"     "CentralAir"    "`1stFlrSF`"   
## [25] "`2ndFlrSF`"    "GrLivArea"     "BsmtFullBath"  "FullBath"     
## [29] "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr"  "KitchenQual"  
## [33] "TotRmsAbvGrd"  "Functional"    "Fireplaces"    "GarageYrBlt"  
## [37] "GarageCars"    "GarageArea"    "PavedDrive"    "WoodDeckSF"   
## [41] "OpenPorchSF"   "EnclosedPorch" "SaleCondition"
# Variable importance scores
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##              meanImp  decision
## GrLivArea   21.78274 Confirmed
## OverallQual 17.63715 Confirmed
## TotalBsmtSF 14.00293 Confirmed
## 2ndFlrSF    13.70135 Confirmed
## YearBuilt   13.50049 Confirmed
## GarageArea  13.30863 Confirmed
### Quick data wrangling needed: some values of training$OverallQual appear too infrequently

  # Check the frequency of each rating
  as.data.frame(table(training$OverallQual))
##    Var1 Freq
## 1     1    2
## 2     2    3
## 3     3   20
## 4     4  116
## 5     5  397
## 6     6  374
## 7     7  319
## 8     8  168
## 9     9   43
## 10   10   18
  # The rating currently runs 1 to 10; collapse the rare low ratings (1 through 4) into a
  # single category "1". Assigning a character value also coerces OverallQual to character,
  # so the models below treat it as categorical.
  training$OverallQual[training$OverallQual < 5] <- '1'
  test$OverallQual[test$OverallQual < 5] <- '1'


  # Re-check
  as.data.frame(table(training$OverallQual))
##   Var1 Freq
## 1    1  141
## 2   10   18
## 3    5  397
## 4    6  374
## 5    7  319
## 6    8  168
## 7    9   43
  as.data.frame(table(test$OverallQual))
##   Var1 Freq
## 1    1  142
## 2   10   13
## 3    5  428
## 4    6  357
## 5    7  281
## 6    8  174
## 7    9   64
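For reference, the same collapsing could be written in dplyr style against the original numeric column (a sketch of an alternative, not run here):

  # Sketch: equivalent recode; assumes OverallQual is still numeric at this point
  training <- training %>%
    dplyr::mutate(OverallQual = if_else(OverallQual < 5, "1", as.character(OverallQual)))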
  ### CV
  training <- training %>%
    dplyr::sample_frac(1) %>%
    dplyr::mutate(fold = rep(1:5, length = n())) %>%
    dplyr::arrange(fold)

  RMSLE <- rep(0, 5)
  for(j in 1:5){
    pretend_training <- training %>%
      filter(fold != j)
    pretend_test <- training %>%
      filter(fold == j)

    # Fit model on pretend training; backticks are needed because 1stFlrSF and 2ndFlrSF
    # start with digits
    model_4_formula <- as.formula("SalePrice ~ GrLivArea + OverallQual + TotalBsmtSF + `1stFlrSF` + GarageCars + `2ndFlrSF`")
    model_4 <- lm(model_4_formula, data = pretend_training)

    # Make predictions on the held-out fold and score it
    predicted_points <- model_4 %>%
      broom::augment(newdata = pretend_test)
    RMSLE[j] <- rmsle(predicted_points$.fitted, predicted_points$SalePrice)
  }

  # Refit on the full training set, then make predictions on the test data
  model_4 <- lm(model_4_formula, data = training)
  predicted_points_4 <- model_4 %>%
    broom::augment(newdata = test)

Estimate of your Kaggle score

  mean(RMSLE)
## [1] 0.1842599

Create your submission CSV

  # Make submission
  sample_submission$SalePrice <- predicted_points_4$.fitted
  write_csv(sample_submission, file = "data/cannonball_abby.csv")

Screenshot of your Kaggle score

Our submission scored a “Root Mean Squared Logarithmic Error” of 0.18575.

Comparisons of estimated scores and Kaggle scores

Our estimated RMSLE was 0.1842599, while Kaggle reported 0.18575; these are comparable. This model performed the best!


  1. Note: I used this Stack Overflow post to get RMSLE working as a caret summary function: https://stackoverflow.com/questions/46827054/create-rmsle-metric-in-caret-in-r

  2. Note: I used this tutorial to implement the Boruta feature selection: https://www.machinelearningplus.com/machine-learning/feature-selection/