if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999)

Preliminary Steps

Loading and Looking at the Data in R

housing.df <- read.csv('WestRoxbury.csv') # load data from file
housing.df <- mlba::WestRoxbury # load data from mlba package
dim(housing.df)  # find the dimension of data frame
head(housing.df)  # show the first six rows
View(housing.df)  # show all the data in a new tab

# Practice showing different subsets of the data
housing.df[1:10, 1]  # show the first 10 rows of the first column only
housing.df[1:10, ]  # show the first 10 rows of each of the columns
housing.df[5, 1:10]  # show the fifth row of the first 10 columns
housing.df[5, c(1:2, 4, 8:10)]  # show the fifth row of some columns
housing.df[, 1]  # show the whole first column
housing.df$TOTAL.VALUE  # a different way to show the whole first column
housing.df$TOTAL.VALUE[1:10]  # show the first 10 rows of the first column
length(housing.df$TOTAL.VALUE)  # find the length of the first column
mean(housing.df$TOTAL.VALUE)  # find the mean of the first column
summary(housing.df)  # find summary statistics for each column

Sampling from a Database

housing.df <- mlba::WestRoxbury

# random sample of 5 observations
s <- sample(row.names(housing.df), 5)
housing.df[s,]
#>      TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 599        412.0 5182     8919     1991       3190        2509    1.5     7
#> 4763       735.4 9251    20467     1965       4800        3261    2.0     9
#> 4913       394.9 4967     7000     1930       2379        1188    2.0     6
#> 4646       435.9 5483     6647     1960       3302        1880    2.0     9
#> 3866       346.0 4352     4000     1940       2613        1584    2.0     6
#>      BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
#> 599         4         1         1       1         1    None
#> 4763        4         2         1       1         1     Old
#> 4913        3         1         1       1         0  Recent
#> 4646        4         1         1       1         0    None
#> 3866        3         1         0       1         1    None
# oversample houses with over 10 rooms
s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01))
housing.df[s,]
#>      TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 3292       605.5 7617    10233     1910       6549        3845    2.5    12
#> 4740       666.4 8383    12137     1915       5600        3462    2.5    11
#> 2609       410.9 5169     4700     1968       2822        1586    2.0    12
#> 5393       413.0 5195     4876     1930       3440        1942    2.0     7
#> 179        516.7 6500     6000     2005       4623        2597    2.0    11
#>      BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
#> 3292        4         2         1       1         1    None
#> 4740        6         2         0       1         1  Recent
#> 2609        3         2         1       1         1    None
#> 5393        3         1         1       1         1  Recent
#> 179         4         2         1       1         1    None
# rebalance
housing.df$REMODEL <- factor(housing.df$REMODEL)
table(housing.df$REMODEL)
#> 
#>   None    Old Recent 
#>   4346    581    875
upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x
table(upsampled.df$REMODEL)
#> 
#>   None    Old Recent 
#>   4346   4346   4346

Preprocessing and Cleaning the Data

Types of Variables

library(tidyverse)

# get overview
str(housing.df)
#> 'data.frame':    5802 obs. of  14 variables:
#>  $ TOTAL.VALUE: num  344 413 330 499 332 ...
#>  $ TAX        : int  4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
#>  $ LOT.SQFT   : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
#>  $ YR.BUILT   : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
#>  $ GROSS.AREA : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
#>  $ LIVING.AREA: int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
#>  $ FLOORS     : num  2 2 2 1 2 1 2 1 1 2 ...
#>  $ ROOMS      : int  6 10 8 9 7 6 7 6 5 8 ...
#>  $ BEDROOMS   : int  3 4 4 5 3 3 3 3 3 4 ...
#>  $ FULL.BATH  : int  1 2 1 1 2 1 1 1 1 2 ...
#>  $ HALF.BATH  : int  1 1 1 1 0 0 1 0 0 0 ...
#>  $ KITCHEN    : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ FIREPLACE  : int  0 0 0 1 0 1 0 0 1 0 ...
#>  $ REMODEL    : Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
# make REMODEL a factor variable
housing.df$REMODEL <- factor(housing.df$REMODEL)
str(housing.df$REMODEL)
#>  Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
levels(housing.df$REMODEL) # show factor's categories (levels)
#> [1] "None"   "Old"    "Recent"
# use tidyverse to load and preprocess data in one statement
# the %>% operator inserts the result of the expression on the left
# as the first argument into the function on the right
housing.df <- mlba::WestRoxbury %>%
  mutate(REMODEL=factor(REMODEL))

Handling Categorical Variables

library(fastDummies)
library(tidyverse)

housing.df <- dummy_cols(mlba::WestRoxbury,
                 remove_selected_columns=TRUE,  # remove the original column
                 remove_first_dummy=TRUE)  # removes the first created dummy variable
housing.df %>% head(2)
#>   TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 1       344.2 4330     9965     1880       2436        1352      2     6
#> 2       412.6 5190     6590     1945       3108        1976      2    10
#>   BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
#> 1        3         1         1       1         0           0              0
#> 2        4         2         1       1         0           0              1

Missing Values

# To illustrate missing data procedures, we first convert a few entries for
# BEDROOMS to NA's. Then we impute these missing values using the median of the
# remaining values.
rows.to.missing <- sample(row.names(housing.df), 10)
housing.df[rows.to.missing,]$BEDROOMS <- NA
summary(housing.df$BEDROOMS)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>    1.00    3.00    3.00    3.23    4.00    9.00      10
# Now we have 10 NA's and the median of the remaining values is 3.

# replace the missing values using the median of the remaining values
# use median() with na.rm=TRUE to ignore missing values when computing the median.
housing.df <- housing.df %>%
  replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE)))

summary(housing.df$BEDROOMS)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    1.00    3.00    3.00    3.23    4.00    9.00

Predictive Power and Overfitting

Creating and Using Data Partitions

Holdout Partition

housing.df <- mlba::WestRoxbury %>%
  mutate(REMODEL=factor(REMODEL))

# use set.seed() to get the same partitions when re-running the R code.
set.seed(1)

## partitioning into training (60%) and holdout (40%)
# randomly sample 60% of the row IDs for training; the remaining 40% serve
# as holdout
train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6)
# collect all the columns with training row ID into training set:
train.df <- housing.df[train.rows, ]
# assign row IDs that are not already in the training set, into holdout
holdout.rows <- setdiff(rownames(housing.df), train.rows)
holdout.df <- housing.df[holdout.rows, ]

## partitioning into training (50%), validation (30%), holdout (20%)
# randomly sample 50% of the row IDs for training
train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5)

# sample 30% of the row IDs into the validation set, drawing only from records
# not already in the training set
# use setdiff() to find records not already in the training set
valid.rows <- sample(setdiff(rownames(housing.df), train.rows),
              nrow(housing.df)*0.3)

# assign the remaining 20% row IDs serve as holdout
holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows))

# create the 3 data frames by collecting all columns from the appropriate rows
train.df <- housing.df[train.rows, ]
valid.df <- housing.df[valid.rows, ]
holdout.df <- housing.df[holdout.rows, ]

## partitioning into training (60%) and holdout (40%) using caret
set.seed(1)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]

Building a Predictive Model

Modeling Process

Cross-Validation

library(tidyverse)
library(mlba)
library(fastDummies)

housing.df <- mlba::WestRoxbury %>%
  # remove rows with missing values
  drop_na() %>%
  # remove column TAX
  select(-TAX) %>%
  # make REMODEL a factor and convert to dummy variables
  mutate(REMODEL=factor(REMODEL)) %>%
  dummy_cols(select_columns=c('REMODEL'),
             remove_selected_columns=TRUE, remove_first_dummy=TRUE)
set.seed(1)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]
reg <- lm(TOTAL.VALUE ~ ., data=train.df)
train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values,
                        residuals=reg$residuals)
head(train.res)
#>    actual predicted  residuals
#> 1   344.2  384.4206 -40.220638
#> 4   498.6  546.4628 -47.862759
#> 5   331.5  347.9170 -16.417031
#> 12  344.5  380.4297 -35.929727
#> 13  315.5  313.1879   2.312083
#> 15  326.2  345.3751 -19.175064
pred <- predict(reg, newdata=holdout.df)
holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred,
                          residuals=holdout.df$TOTAL.VALUE - pred)
head(holdout.res)
#>   actual predicted  residuals
#> 2  412.6  460.2777 -47.677744
#> 3  330.1  359.3920 -29.291958
#> 6  337.4  290.0277  47.372303
#> 7  359.4  402.5332 -43.133242
#> 8  320.4  314.0683   6.331652
#> 9  333.5  339.8206  -6.320582
library(caret)
# compute metrics on training set
data.frame(
    ME = round(mean(train.res$residuals), 5),
    RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual),
    MAE = MAE(pred=train.res$predicted, obs=train.res$actual)
)
#>   ME     RMSE      MAE
#> 1  0 42.14665 31.98717
# compute metrics on holdout set
data.frame(
    ME = round(mean(holdout.res$residuals), 5),
    RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual),
    MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual)
)
#>         ME     RMSE      MAE
#> 1 -1.04237 43.90381 33.05476
# For demonstration purposes, we construct the new.data from the original dataset
housing.df <- mlba::WestRoxbury
new.data <- housing.df[100:102, -1] %>%
  mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>%
  dummy_cols(select_columns=c('REMODEL'),
           remove_selected_columns=TRUE, remove_first_dummy=TRUE)
new.data
#>    TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS FULL.BATH
#> 1 3818     4200     1960       2670        1710    2.0    10        4         1
#> 2 3791     6444     1940       2886        1474    1.5     6        3         1
#> 3 4275     5035     1925       3264        1523    1.0     6        2         1
#>   HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
#> 1         1       1         1           0              0
#> 2         1       1         1           0              0
#> 3         0       1         0           0              1
pred <- predict(reg, newdata = new.data)
pred
#>        1        2        3 
#> 385.9718 378.6392 352.1145