0) Original Article

https://doi.org/10.1080/10691898.2011.11889627

00) Libraries

library(tidyverse)
library(magrittr) # %<>%
library(AmesHousing)
library(DT) # datatable()
library(ggforce) # facet_wrap_paginate()
library(corrplot)
library(GGally) # ggpairs()
library(moderndive) # get_regression_table() # more info than broom::tidy()
library(car) # vif()
library(leaps)
library(broom) # glance

options(digits = 3, scipen = 999) # remove scientific notation

# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

01) Load the Ames housing dataset.

# ames <- read_csv("Savannah S/Ames/AmesHousing.csv")
ames <- ames_raw
ames %<>%
  rename_all(make.names) # replace space in variable names with .

02) Exploratory Data Analysis

## Continuous
# Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1,
# BsmtFin.SF.2, Bsmt.Unf.SF, Total.Bsmt.SF,
# 1st.Flr.SF, 2nd.Flr.SF,  Low.Qual.Fin.SF, Gr.Liv.Area,
# Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice   

## Discrete
# Year Built, Year Remod, Bsmt Full Bath, Bsmt Half Bath, Full Bath, 
# Half Bath, Bedroom AbvGr, Kitchen AbvGr, TotRmsAbvGrd,
# Fireplaces, Garage Yr Blt, Garage Cars,
# Mo Sold, Yr Sold, 

# Other
# PID is parcel IDs as chr

# Count NAs in all Variables
var.class <- data.frame(Class =  sapply(ames, class))
var.class <- tibble(Variable = row.names(var.class), 
                    Class = (if_else(var.class$Class == "character",
                                     "Nominal or Ordinal", 
                                     "Continuous or Discrete")))

ames %>% 
  keep(~ sum(is.na(.x)) > 0) %>% # keep columns w/NAs
  gather(key = "Variable") %>% # pivot.longer error with chr class variables
  group_by(Variable) %>%
  summarize(Count_NAs = sum(is.na(value))) %>%
  inner_join(var.class) %>% # make "Class" column
  # filter(Class == "Continuous or Discrete") %>%
  datatable(options = list(pageLength = 5),
            caption = "NAs")
## Joining, by = "Variable"
# Summary of continuous and discrete data
ames.continuous <-
  ames %>%
  select(Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2,
         Bsmt.Unf.SF, Total.Bsmt.SF, X1st.Flr.SF, X2nd.Flr.SF, Low.Qual.Fin.SF,
         Gr.Liv.Area, Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice) 

ames.continuous %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  group_by(Variable) %>%
  summarize(across(.cols = value,
                   list(Min = ~ min(.x, na.rm = TRUE),
                        Mean = ~ mean(.x, na.rm = TRUE),
                        Median = ~ median(.x, na.rm = TRUE),
                        Max = ~ max(.x, na.rm = TRUE),
                        Mode = getmode, # 0 can mean no mode
                        StdDev = ~ sd(.x, na.rm = TRUE),
                        NAs = ~ sum(is.na(.x))
                        ),
                   .names = "{.fn}"
                   )
            ) %>%
  mutate(across(.cols = !Variable, round, 2)) %>%
  datatable(options = list(pageLength = 5),
            caption = "Summary of Continuous Data")
# Version 2
# ames %>%
#  mutate(across(.cols = PID, as.integer)) %>%
#  select(where(is_integer)) %>%
#  pivot_longer(cols = everything(), names_to = "Variable") %>%
#  group_by(Variable) %>%
#  summarize(Min = min(value, na.rm = TRUE),
#            Mean = mean(value, na.rm = TRUE),
#            Median = median(value, na.rm = TRUE),
#            Max = max(value, na.rm = TRUE),
#            Mode = getmode(value),
#            StdDev = sd(value, na.rm = TRUE),
#            NAs = sum(is.na(value))
#            ) %>%
#  mutate(across(.cols = !Variable, round, 2)) %>%
#  datatable(options = list(pageLength = 5),
#            caption = "Summary of continuous and discrete data")

# Summary of character (categorical) data
# ames %>%
#  select(where(is_character), -PID) %>%
#  mutate(across(.cols = everything(), as_factor)) %>%
#  summary()

# ames %>%
#  select(where(is_character), -PID) %>%
#  pivot_longer(cols = everything(), names_to = "Variable") %>%
#  group_by(Variable) %>%
#  summarise(across(.cols = value,
#                   list(Mode = getmode,
#                        NAs = ~ sum(is.na(.x))),
#                   .names = "{.fn}")
#            ) %>%
#  datatable(options = list(pageLength = 5),
#            caption = "Summary of character (categorical) data")

03) Cleaning Data

# All NAs appear to be correctly and intentionally entered. 
# Some incomplete cases contain conflicting information

# "#" categorical NAs
# "###" discrete/continuous NAs

# Alley = No Alley Access
# Bsmt.Cond = No Basement
# Bsmt.Exposure = No Basement
### Bsmt.Full.Bath = 0, discrete
### Bsmt.Half.Bath =0, discrete
# Bsmt.Qual = No Basement
### Bsmt.Unf.SF = 0, continuous
### BsmtFin.SF.1 = 0, continuous
### BsmtFin.SF.2 = 0, continuous
# BsmtFin.Type.1 = No Basement
# BsmtFin.Type.2 = No Second Category
# Fence = No Fence
# Fireplace.qu = No FirePlace
### Garage.Area = unclear for Order = 2237, continuous 
### Garage.Cars = unclear for Order = 2237, discrete
# Garage.Cond = No Garage
# Garage.Finish = No Garage
# Garage.Qual =  No Garage
# Garage.Type = No Garage
### Garage.Yr.Blt = unclear/unknown for Order = (1357,2237), else no garage, discrete
### Lot.Frontage = 0, continuous
### Mas.Vnr.Area = 0, continuous
# Mas.Vnr.Type = None
# Misc.Feature = None
# Pool.QC = No Pool
### Total.Bsmt.SF = 0, continuous 

# used similar code to identify continuous/discrete NA information:
# ames %>% filter(is.na(Garage.Cars)) %>% select(Order, contains("Garage")) %>% view()

ames.continuous %<>%
  slice(-1357, -2237) %>% # see Garage comments above
  drop_na() # 508 observations

04) Correlation Matrix

ames.continuous %>%
  cor() %>%
  round(digits = 2) %>%
  datatable(options = list(pageLength = 5), caption = "Continuous Correlations")

05) Correlation Plot

ames.continuous %>%
  cor() %>%
  corrplot(type = "upper", 
            diag = FALSE,
            title = "Continuous Correlations",
            order = "alphabet")

06) Scatter Plots

# All continuous correlations with SalePrice
SalePrice.cor <-
  ames.continuous %>%
  cor() %>%
  as.data.frame() %>%
  select(SalePrice)

# max correlation with Saleprice
SalePrice.cor %>%
  arrange(desc(SalePrice)) %>%
  slice(2) 
##             SalePrice
## Gr.Liv.Area     0.707
# min correlation with Saleprice
SalePrice.cor %>%
  slice_min(SalePrice)
##                SalePrice
## Enclosed.Porch    -0.149
# closest to 0.5 correlation with SalePrice
SalePrice.cor %>%
  arrange(desc(SalePrice)) %>%
  filter(near(SalePrice, 0.5, 0.05))
##              SalePrice
## Mas.Vnr.Area     0.533
# Pairs
ames %>%
  select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
  ggpairs()

ames %>%
  select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
  pivot_longer(-SalePrice, names_to = "Variable", values_to = "value") %>%
  ggplot(aes(value, SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y~x, se = FALSE) +
  facet_wrap(vars(Variable))

07) Additive Linear Models

# Full Additive Model
model.full <-
  ames.continuous %>%
  lm(SalePrice ~ ., data = .)

# Reduced Model
model.reduced <-
  ames.continuous %>%
  select(-Lot.Area, -Total.Bsmt.SF, -Low.Qual.Fin.SF, -Gr.Liv.Area, -X3Ssn.Porch
         ) %>% # p-value > 0.4 or missing in full model. Gr.Liv.Area in cor plots
  lm(SalePrice ~ ., data = .)

08) Interpret Linear Model

model.reduced %>% 
  get_regression_table(digits = 2) %>%
  datatable(options = list(pageLength = 5),
            caption = "Reduced Linear Model for Predicting SalePrice, Rounded, Blanks = NA")

09) Linear Model Plots

Log(SalePrice) produces better QQ plot but results in model coefficients = 0.

plot(model.reduced, which = 1:6)

10) Multicollinearity

https://www.wikiwand.com/en/Variance_inflation_factor#/Step_three

model.reduced %>%
  vif() %>%
  data.frame(VIF = .) %>% 
  arrange(desc(VIF)) %>%
  mutate(Multicollinearity = if_else(VIF > 10, "True", "False"))
##                 VIF Multicollinearity
## BsmtFin.SF.1   3.77             False
## X1st.Flr.SF    3.66             False
## Bsmt.Unf.SF    3.31             False
## Garage.Area    1.62             False
## Mas.Vnr.Area   1.41             False
## X2nd.Flr.SF    1.39             False
## BsmtFin.SF.2   1.37             False
## Lot.Frontage   1.36             False
## Open.Porch.SF  1.19             False
## Wood.Deck.SF   1.17             False
## Pool.Area      1.07             False
## Enclosed.Porch 1.07             False
## Misc.Val       1.04             False
## Screen.Porch   1.03             False

11) Model Outliers

## Plot Fitted Values
model.full %>%
  fitted() %>%
  tibble() %>%
  ggplot(aes(y= .)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggtitle("Full Linear Model Fitted Values")

# Fitted Inner Fences (Weak Outliers)
fitted.inner.fence.lower <-
  quantile(fitted(model.full), 0.25) - 1.5 * IQR(fitted(model.full))

fitted.inner.fence.upper <- 
  quantile(fitted(model.full), 0.75) + 1.5 * IQR(fitted(model.full))

# Identify Fitted Weak Outliers
model.full %>% 
  fitted() %>%
  tibble() %>%
  filter(. < fitted.inner.fence.lower | . > fitted.inner.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  fitted.weak.outlier.index

# Fitted Outer Fences (Strong Outliers)
fitted.outer.fence.lower <-
  quantile(fitted(model.full), 0.25) - 3 * IQR(fitted(model.full))

fitted.outer.fence.upper <- 
  quantile(fitted(model.full), 0.75) + 3 * IQR(fitted(model.full))

# Identify Fitted Strong Outliers
model.full %>% 
  fitted() %>%
  tibble() %>%
  filter(. < fitted.outer.fence.lower | . > fitted.outer.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  fitted.strong.outlier.index

## Plot Residuals
model.full %>%
  residuals() %>%
  tibble() %>%
  ggplot(aes(y= .)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggtitle("Linear Model Redisuals")

# Residual Inner Fences (Weak Outliers)
residual.inner.fence.lower <-
  quantile(residuals(model.full), 0.25) - 1.5 * IQR(residuals(model.full))

residual.inner.fence.upper <-
  quantile(residuals(model.full), 0.75) + 1.5 * IQR(residuals(model.full))

# Identify Residual Weak Outliers
model.full %>% 
  residuals() %>%
  tibble() %>%
  filter(. < residual.inner.fence.lower | . > residual.inner.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  residual.weak.outlier.index

# Residual Outer Fences (Strong Outliers)
residual.outer.fence.lower <- 
  quantile(residuals(model.full), 0.25) - 3 * IQR(residuals(model.full))

residual.outer.fence.upper <-
  quantile(residuals(model.full), 0.75) + 3 * IQR(residuals(model.full))

# Identify Residual Strong Outliers
model.full %>% 
  residuals() %>%
  tibble() %>%
  filter(. < residual.outer.fence.lower | . > residual.outer.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  residual.strong.outlier.index

## Positions of all outliers
union(fitted.weak.outlier.index, residual.weak.outlier.index) %>% sort()
##   [1]    3   14   16   32   34   40   42   58  103  113  143  154  159  162  178
##  [16]  204  215  251  252  274  286  296  308  314  317  349  350  351  354  355
##  [31]  358  359  360  361  362  364  375  376  384  413  415  419  427  434  494
##  [46]  508  524  533  549  552  580  582  596  630  653  753  777  794  801  834
##  [61]  836  837  865  866  867  868  870  871  873  874  878  879  882  913  914
##  [76]  932  960  969  971  977  980  991  992 1041 1045 1087 1122 1164 1180 1181
##  [91] 1193 1210 1232 1261 1283 1285 1307 1328 1339 1341 1343 1344 1345 1346 1384
## [106] 1385 1387 1389 1390 1391 1393 1394 1395 1400 1401 1447 1448 1449 1452 1458
## [121] 1464 1466 1519 1520 1525 1551 1553 1560 1600 1627 1636 1641 1682 1700 1717
## [136] 1725 1794 1795 1824 1828 1837 1851 1869 1871 1890 1906 1911 1912 1913 1914
## [151] 1916 1918 1922 1953 1957 1972 2005 2009 2012 2017 2035 2040 2047 2062 2078
## [166] 2094 2121 2131 2133 2171 2185 2193 2224 2238 2249 2381 2388 2403 2413

12) Full Model Reduction for Outliers

# Manually verified that explanatory variable outliers removed in Step (12)

## Model without strong outliers

# strong outlier indices not subsets of each other
fitted.strong.outlier.index %in% residual.strong.outlier.index
## [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE
model.full.strongoutliers <-
  ames.continuous %>%
  slice(-fitted.strong.outlier.index, -residual.strong.outlier.index) %>%
  # select(-X3Ssn.Porch) %>%
  lm(SalePrice ~ ., data = .)

## Model with both types of outliers removed

# weak outlier indices not subsets of each other
sum(fitted.weak.outlier.index %in% residual.weak.outlier.index) == length(fitted.weak.outlier.index)
## [1] FALSE
model.full.weakoutliers <-
  ames.continuous %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
  # select(-X3Ssn.Porch) %>%
  lm(SalePrice ~ ., data = .)

13) All Subsets Regression Method

rseek.org

model.full %>% alias() # identify linearly dependent variables
## Model :
## SalePrice ~ Lot.Frontage + Lot.Area + Mas.Vnr.Area + BsmtFin.SF.1 + 
##     BsmtFin.SF.2 + Bsmt.Unf.SF + Total.Bsmt.SF + X1st.Flr.SF + 
##     X2nd.Flr.SF + Low.Qual.Fin.SF + Gr.Liv.Area + Garage.Area + 
##     Wood.Deck.SF + Open.Porch.SF + Enclosed.Porch + X3Ssn.Porch + 
##     Screen.Porch + Pool.Area + Misc.Val
## 
## Complete :
##               (Intercept) Lot.Frontage Lot.Area Mas.Vnr.Area BsmtFin.SF.1
## Total.Bsmt.SF 0           0            0        0            1           
## Gr.Liv.Area   0           0            0        0            0           
##               BsmtFin.SF.2 Bsmt.Unf.SF X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF
## Total.Bsmt.SF 1            1           0           0           0              
## Gr.Liv.Area   0            0           1           1           1              
##               Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## Total.Bsmt.SF 0           0            0             0              0          
## Gr.Liv.Area   0           0            0             0              0          
##               Screen.Porch Pool.Area Misc.Val
## Total.Bsmt.SF 0            0         0       
## Gr.Liv.Area   0            0         0
ames.continuous.predictors <- 
  ames.continuous %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>% # outliers
  select(-Total.Bsmt.SF, -Gr.Liv.Area) # remove linearly dependent variables

# Find best model of each size
models.leaps <-
  leaps(x = ames.continuous.predictors %>% select(-SalePrice),
        y = ames.continuous.predictors$SalePrice,
        method = "adjr2",
        nbest = 1,
        names = ames.continuous.predictors %>% names())

# coeffiences for said models
models.regsubsets <-
  regsubsets(SalePrice ~ .,
           data = ames.continuous.predictors,
           nbest = 1,
           nvmax = NCOL(ames.continuous.predictors) - 1)

# model selected for some balance of highest highest adjusted r^2 and d.f.
models.leaps$adjr2
##  [1] 0.445 0.610 0.713 0.741 0.770 0.779 0.787 0.794 0.799 0.802 0.804 0.805
## [13] 0.805 0.805 0.805 0.805 0.805
# models.leaps$which[12, ]
coef(models.regsubsets, 12) %>% names()
##  [1] "(Intercept)"    "Lot.Area"       "Mas.Vnr.Area"   "BsmtFin.SF.1"  
##  [5] "BsmtFin.SF.2"   "Bsmt.Unf.SF"    "X1st.Flr.SF"    "X2nd.Flr.SF"   
##  [9] "Garage.Area"    "Wood.Deck.SF"   "Open.Porch.SF"  "Enclosed.Porch"
## [13] "Pool.Area"
# create "All Subsets Regression Method" model
model.asr.outliers <-
  ames.continuous.predictors %>%
  select(Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2, Bsmt.Unf.SF,
         X1st.Flr.SF, X2nd.Flr.SF, Garage.Area, Wood.Deck.SF, Open.Porch.SF,
         Enclosed.Porch, Pool.Area, SalePrice) %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
  lm(SalePrice ~., data = .)

14) Compare Models

bind_rows(model.full %>% glance,
          model.reduced %>% glance,
          model.full.strongoutliers %>% glance,
          model.full.weakoutliers %>% glance,
          model.asr.outliers %>% glance) %>%
  mutate(across(.cols = c(r.squared, adj.r.squared), round, 2)) %>% 
  mutate(across(.cols = c(sigma:nobs), round, 0)) %>%
  mutate(Model = c("Full Additive",
                   "Reduced for significance",
                   "Reduced for Strong Outliers",
                   "Reduced for All Outliers ",
                   "All Subsets Method, Reduced for Outliers & Lin. Dep."),
         .before = "r.squared") %>%
  datatable(caption = "Compare Linear Models, rounded")