0) Original Article

https://doi.org/10.1080/10691898.2011.11889627

00) Libraries

library(tidyverse)
library(magrittr) # %<>%
library(AmesHousing)
library(DT) # datatable()
library(ggforce) # facet_wrap_paginate()
library(corrplot)
library(GGally) # ggpairs()
library(moderndive) # get_regression_table() # more info than broom::tidy()
library(car) # vif()
library(leaps)
library(broom) # glance

options(digits = 3, scipen = 999) # remove scientific notation

# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

01) Load the Ames housing dataset.

# ames <- read_csv("Savannah S/Ames/AmesHousing.csv")
ames <- ames_raw
ames %<>%
  rename_all(make.names) # replace space in variable names with .

02) Exploratory Data Analysis

## Continuous
# Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1,
# BsmtFin.SF.2, Bsmt.Unf.SF, Total.Bsmt.SF,
# 1st.Flr.SF, 2nd.Flr.SF,  Low.Qual.Fin.SF, Gr.Liv.Area,
# Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice   

## Discrete
# Year Built, Year Remod, Bsmt Full Bath, Bsmt Half Bath, Full Bath, 
# Half Bath, Bedroom AbvGr, Kitchen AbvGr, TotRmsAbvGrd,
# Fireplaces, Garage Yr Blt, Garage Cars,
# Mo Sold, Yr Sold, 

# Other
# PID is parcel IDs as chr

# Count NAs in all Variables
var.class <- data.frame(Class =  sapply(ames, class))
var.class <- tibble(Variable = row.names(var.class), 
                    Class = (if_else(var.class$Class == "character",
                                     "Nominal or Ordinal", 
                                     "Continuous or Discrete")))

ames %>% 
  keep(~ sum(is.na(.x)) > 0) %>% # keep columns w/NAs
  gather(key = "Variable") %>% # pivot.longer error with chr class variables
  group_by(Variable) %>%
  summarize(Count_NAs = sum(is.na(value))) %>%
  inner_join(var.class) %>% # make "Class" column
  # filter(Class == "Continuous or Discrete") %>%
  datatable(options = list(pageLength = 5),
            caption = "NAs")

## Joining, by = "Variable"

# Summary of continuous and discrete data
ames.continuous <-
  ames %>%
  select(Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2,
         Bsmt.Unf.SF, Total.Bsmt.SF, X1st.Flr.SF, X2nd.Flr.SF, Low.Qual.Fin.SF,
         Gr.Liv.Area, Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice) 

ames.continuous %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  group_by(Variable) %>%
  summarize(across(.cols = value,
                   list(Min = ~ min(.x, na.rm = TRUE),
                        Mean = ~ mean(.x, na.rm = TRUE),
                        Median = ~ median(.x, na.rm = TRUE),
                        Max = ~ max(.x, na.rm = TRUE),
                        Mode = getmode, # 0 can mean no mode
                        StdDev = ~ sd(.x, na.rm = TRUE),
                        NAs = ~ sum(is.na(.x))
                        ),
                   .names = "{.fn}"
                   )
            ) %>%
  mutate(across(.cols = !Variable, round, 2)) %>%
  datatable(options = list(pageLength = 5),
            caption = "Summary of Continuous Data")

# Version 2
# ames %>%
#  mutate(across(.cols = PID, as.integer)) %>%
#  select(where(is_integer)) %>%
#  pivot_longer(cols = everything(), names_to = "Variable") %>%
#  group_by(Variable) %>%
#  summarize(Min = min(value, na.rm = TRUE),
#            Mean = mean(value, na.rm = TRUE),
#            Median = median(value, na.rm = TRUE),
#            Max = max(value, na.rm = TRUE),
#            Mode = getmode(value),
#            StdDev = sd(value, na.rm = TRUE),
#            NAs = sum(is.na(value))
#            ) %>%
#  mutate(across(.cols = !Variable, round, 2)) %>%
#  datatable(options = list(pageLength = 5),
#            caption = "Summary of continuous and discrete data")

# Summary of character (categorical) data
# ames %>%
#  select(where(is_character), -PID) %>%
#  mutate(across(.cols = everything(), as_factor)) %>%
#  summary()

# ames %>%
#  select(where(is_character), -PID) %>%
#  pivot_longer(cols = everything(), names_to = "Variable") %>%
#  group_by(Variable) %>%
#  summarise(across(.cols = value,
#                   list(Mode = getmode,
#                        NAs = ~ sum(is.na(.x))),
#                   .names = "{.fn}")
#            ) %>%
#  datatable(options = list(pageLength = 5),
#            caption = "Summary of character (categorical) data")

03) Cleaning Data

# All NAs appear to be correctly and intentionally entered. 
# Some incomplete cases contain conflicting information

# "#" categorical NAs
# "###" discrete/continuous NAs

# Alley = No Alley Access
# Bsmt.Cond = No Basement
# Bsmt.Exposure = No Basement
### Bsmt.Full.Bath = 0, discrete
### Bsmt.Half.Bath =0, discrete
# Bsmt.Qual = No Basement
### Bsmt.Unf.SF = 0, continuous
### BsmtFin.SF.1 = 0, continuous
### BsmtFin.SF.2 = 0, continuous
# BsmtFin.Type.1 = No Basement
# BsmtFin.Type.2 = No Second Category
# Fence = No Fence
# Fireplace.qu = No FirePlace
### Garage.Area = unclear for Order = 2237, continuous 
### Garage.Cars = unclear for Order = 2237, discrete
# Garage.Cond = No Garage
# Garage.Finish = No Garage
# Garage.Qual =  No Garage
# Garage.Type = No Garage
### Garage.Yr.Blt = unclear/unknown for Order = (1357,2237), else no garage, discrete
### Lot.Frontage = 0, continuous
### Mas.Vnr.Area = 0, continuous
# Mas.Vnr.Type = None
# Misc.Feature = None
# Pool.QC = No Pool
### Total.Bsmt.SF = 0, continuous 

# used similar code to identify continuous/discrete NA information:
# ames %>% filter(is.na(Garage.Cars)) %>% select(Order, contains("Garage")) %>% view()

ames.continuous %<>%
  slice(-1357, -2237) %>% # see Garage comments above
  drop_na() # 508 observations

04) Correlation Matrix

ames.continuous %>%
  cor() %>%
  round(digits = 2) %>%
  datatable(options = list(pageLength = 5), caption = "Continuous Correlations")

05) Correlation Plot

ames.continuous %>%
  cor() %>%
  corrplot(type = "upper", 
            diag = FALSE,
            title = "Continuous Correlations",
            order = "alphabet")

06) Scatter Plots

# All continuous correlations with SalePrice
SalePrice.cor <-
  ames.continuous %>%
  cor() %>%
  as.data.frame() %>%
  select(SalePrice)

# max correlation with Saleprice
SalePrice.cor %>%
  arrange(desc(SalePrice)) %>%
  slice(2)

##             SalePrice
## Gr.Liv.Area     0.707

# min correlation with Saleprice
SalePrice.cor %>%
  slice_min(SalePrice)

##                SalePrice
## Enclosed.Porch    -0.149

# closest to 0.5 correlation with SalePrice
SalePrice.cor %>%
  arrange(desc(SalePrice)) %>%
  filter(near(SalePrice, 0.5, 0.05))

##              SalePrice
## Mas.Vnr.Area     0.533

# Pairs
ames %>%
  select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
  ggpairs()

ames %>%
  select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
  pivot_longer(-SalePrice, names_to = "Variable", values_to = "value") %>%
  ggplot(aes(value, SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y~x, se = FALSE) +
  facet_wrap(vars(Variable))

07) Additive Linear Models

# Full Additive Model
model.full <-
  ames.continuous %>%
  lm(SalePrice ~ ., data = .)

# Reduced Model
model.reduced <-
  ames.continuous %>%
  select(-Lot.Area, -Total.Bsmt.SF, -Low.Qual.Fin.SF, -Gr.Liv.Area, -X3Ssn.Porch
         ) %>% # p-value > 0.4 or missing in full model. Gr.Liv.Area in cor plots
  lm(SalePrice ~ ., data = .)

08) Interpret Linear Model

model.reduced %>% 
  get_regression_table(digits = 2) %>%
  datatable(options = list(pageLength = 5),
            caption = "Reduced Linear Model for Predicting SalePrice, Rounded, Blanks = NA")

09) Linear Model Plots

Log(SalePrice) produces better QQ plot but results in model coefficients = 0.

plot(model.reduced, which = 1:6)

10) Multicollinearity

https://www.wikiwand.com/en/Variance_inflation_factor#/Step_three

model.reduced %>%
  vif() %>%
  data.frame(VIF = .) %>% 
  arrange(desc(VIF)) %>%
  mutate(Multicollinearity = if_else(VIF > 10, "True", "False"))

##                 VIF Multicollinearity
## BsmtFin.SF.1   3.77             False
## X1st.Flr.SF    3.66             False
## Bsmt.Unf.SF    3.31             False
## Garage.Area    1.62             False
## Mas.Vnr.Area   1.41             False
## X2nd.Flr.SF    1.39             False
## BsmtFin.SF.2   1.37             False
## Lot.Frontage   1.36             False
## Open.Porch.SF  1.19             False
## Wood.Deck.SF   1.17             False
## Pool.Area      1.07             False
## Enclosed.Porch 1.07             False
## Misc.Val       1.04             False
## Screen.Porch   1.03             False

11) Model Outliers

## Plot Fitted Values
model.full %>%
  fitted() %>%
  tibble() %>%
  ggplot(aes(y= .)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggtitle("Full Linear Model Fitted Values")

# Fitted Inner Fences (Weak Outliers)
fitted.inner.fence.lower <-
  quantile(fitted(model.full), 0.25) - 1.5 * IQR(fitted(model.full))

fitted.inner.fence.upper <- 
  quantile(fitted(model.full), 0.75) + 1.5 * IQR(fitted(model.full))

# Identify Fitted Weak Outliers
model.full %>% 
  fitted() %>%
  tibble() %>%
  filter(. < fitted.inner.fence.lower | . > fitted.inner.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  fitted.weak.outlier.index

# Fitted Outer Fences (Strong Outliers)
fitted.outer.fence.lower <-
  quantile(fitted(model.full), 0.25) - 3 * IQR(fitted(model.full))

fitted.outer.fence.upper <- 
  quantile(fitted(model.full), 0.75) + 3 * IQR(fitted(model.full))

# Identify Fitted Strong Outliers
model.full %>% 
  fitted() %>%
  tibble() %>%
  filter(. < fitted.outer.fence.lower | . > fitted.outer.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  fitted.strong.outlier.index

## Plot Residuals
model.full %>%
  residuals() %>%
  tibble() %>%
  ggplot(aes(y= .)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggtitle("Linear Model Redisuals")

# Residual Inner Fences (Weak Outliers)
residual.inner.fence.lower <-
  quantile(residuals(model.full), 0.25) - 1.5 * IQR(residuals(model.full))

residual.inner.fence.upper <-
  quantile(residuals(model.full), 0.75) + 1.5 * IQR(residuals(model.full))

# Identify Residual Weak Outliers
model.full %>% 
  residuals() %>%
  tibble() %>%
  filter(. < residual.inner.fence.lower | . > residual.inner.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  residual.weak.outlier.index

# Residual Outer Fences (Strong Outliers)
residual.outer.fence.lower <- 
  quantile(residuals(model.full), 0.25) - 3 * IQR(residuals(model.full))

residual.outer.fence.upper <-
  quantile(residuals(model.full), 0.75) + 3 * IQR(residuals(model.full))

# Identify Residual Strong Outliers
model.full %>% 
  residuals() %>%
  tibble() %>%
  filter(. < residual.outer.fence.lower | . > residual.outer.fence.upper) %>%
  pull() %>%
  names() %>%
  as.integer() ->
  residual.strong.outlier.index

## Positions of all outliers
union(fitted.weak.outlier.index, residual.weak.outlier.index) %>% sort()

##   [1]    3   14   16   32   34   40   42   58  103  113  143  154  159  162  178
##  [16]  204  215  251  252  274  286  296  308  314  317  349  350  351  354  355
##  [31]  358  359  360  361  362  364  375  376  384  413  415  419  427  434  494
##  [46]  508  524  533  549  552  580  582  596  630  653  753  777  794  801  834
##  [61]  836  837  865  866  867  868  870  871  873  874  878  879  882  913  914
##  [76]  932  960  969  971  977  980  991  992 1041 1045 1087 1122 1164 1180 1181
##  [91] 1193 1210 1232 1261 1283 1285 1307 1328 1339 1341 1343 1344 1345 1346 1384
## [106] 1385 1387 1389 1390 1391 1393 1394 1395 1400 1401 1447 1448 1449 1452 1458
## [121] 1464 1466 1519 1520 1525 1551 1553 1560 1600 1627 1636 1641 1682 1700 1717
## [136] 1725 1794 1795 1824 1828 1837 1851 1869 1871 1890 1906 1911 1912 1913 1914
## [151] 1916 1918 1922 1953 1957 1972 2005 2009 2012 2017 2035 2040 2047 2062 2078
## [166] 2094 2121 2131 2133 2171 2185 2193 2224 2238 2249 2381 2388 2403 2413

12) Full Model Reduction for Outliers

# Manually verified that explanatory variable outliers removed in Step (12)

## Model without strong outliers

# strong outlier indices not subsets of each other
fitted.strong.outlier.index %in% residual.strong.outlier.index

## [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE

model.full.strongoutliers <-
  ames.continuous %>%
  slice(-fitted.strong.outlier.index, -residual.strong.outlier.index) %>%
  # select(-X3Ssn.Porch) %>%
  lm(SalePrice ~ ., data = .)

## Model with both types of outliers removed

# weak outlier indices not subsets of each other
sum(fitted.weak.outlier.index %in% residual.weak.outlier.index) == length(fitted.weak.outlier.index)

## [1] FALSE

model.full.weakoutliers <-
  ames.continuous %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
  # select(-X3Ssn.Porch) %>%
  lm(SalePrice ~ ., data = .)

13) All Subsets Regression Method

rseek.org

model.full %>% alias() # identify linearly dependent variables

## Model :
## SalePrice ~ Lot.Frontage + Lot.Area + Mas.Vnr.Area + BsmtFin.SF.1 + 
##     BsmtFin.SF.2 + Bsmt.Unf.SF + Total.Bsmt.SF + X1st.Flr.SF + 
##     X2nd.Flr.SF + Low.Qual.Fin.SF + Gr.Liv.Area + Garage.Area + 
##     Wood.Deck.SF + Open.Porch.SF + Enclosed.Porch + X3Ssn.Porch + 
##     Screen.Porch + Pool.Area + Misc.Val
## 
## Complete :
##               (Intercept) Lot.Frontage Lot.Area Mas.Vnr.Area BsmtFin.SF.1
## Total.Bsmt.SF 0           0            0        0            1           
## Gr.Liv.Area   0           0            0        0            0           
##               BsmtFin.SF.2 Bsmt.Unf.SF X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF
## Total.Bsmt.SF 1            1           0           0           0              
## Gr.Liv.Area   0            0           1           1           1              
##               Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## Total.Bsmt.SF 0           0            0             0              0          
## Gr.Liv.Area   0           0            0             0              0          
##               Screen.Porch Pool.Area Misc.Val
## Total.Bsmt.SF 0            0         0       
## Gr.Liv.Area   0            0         0

ames.continuous.predictors <- 
  ames.continuous %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>% # outliers
  select(-Total.Bsmt.SF, -Gr.Liv.Area) # remove linearly dependent variables

# Find best model of each size
models.leaps <-
  leaps(x = ames.continuous.predictors %>% select(-SalePrice),
        y = ames.continuous.predictors$SalePrice,
        method = "adjr2",
        nbest = 1,
        names = ames.continuous.predictors %>% names())

# coeffiences for said models
models.regsubsets <-
  regsubsets(SalePrice ~ .,
           data = ames.continuous.predictors,
           nbest = 1,
           nvmax = NCOL(ames.continuous.predictors) - 1)

# model selected for some balance of highest highest adjusted r^2 and d.f.
models.leaps$adjr2

##  [1] 0.445 0.610 0.713 0.741 0.770 0.779 0.787 0.794 0.799 0.802 0.804 0.805
## [13] 0.805 0.805 0.805 0.805 0.805

# models.leaps$which[12, ]
coef(models.regsubsets, 12) %>% names()

##  [1] "(Intercept)"    "Lot.Area"       "Mas.Vnr.Area"   "BsmtFin.SF.1"  
##  [5] "BsmtFin.SF.2"   "Bsmt.Unf.SF"    "X1st.Flr.SF"    "X2nd.Flr.SF"   
##  [9] "Garage.Area"    "Wood.Deck.SF"   "Open.Porch.SF"  "Enclosed.Porch"
## [13] "Pool.Area"

# create "All Subsets Regression Method" model
model.asr.outliers <-
  ames.continuous.predictors %>%
  select(Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2, Bsmt.Unf.SF,
         X1st.Flr.SF, X2nd.Flr.SF, Garage.Area, Wood.Deck.SF, Open.Porch.SF,
         Enclosed.Porch, Pool.Area, SalePrice) %>%
  slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
  lm(SalePrice ~., data = .)

14) Compare Models

bind_rows(model.full %>% glance,
          model.reduced %>% glance,
          model.full.strongoutliers %>% glance,
          model.full.weakoutliers %>% glance,
          model.asr.outliers %>% glance) %>%
  mutate(across(.cols = c(r.squared, adj.r.squared), round, 2)) %>% 
  mutate(across(.cols = c(sigma:nobs), round, 0)) %>%
  mutate(Model = c("Full Additive",
                   "Reduced for significance",
                   "Reduced for Strong Outliers",
                   "Reduced for All Outliers ",
                   "All Subsets Method, Reduced for Outliers & Lin. Dep."),
         .before = "r.squared") %>%
  datatable(caption = "Compare Linear Models, rounded")

04.16.2021 Savannah S: AMES

Nathan at GrandValleyTutor.com

4/15/2021

0) Original Article

00) Libraries

01) Load the Ames housing dataset.

02) Exploratory Data Analysis

03) Cleaning Data

04) Correlation Matrix

05) Correlation Plot

06) Scatter Plots

07) Additive Linear Models

08) Interpret Linear Model

09) Linear Model Plots

10) Multicollinearity

11) Model Outliers

12) Full Model Reduction for Outliers

13) All Subsets Regression Method

14) Compare Models