library(tidyverse)
library(magrittr) # %<>%
library(AmesHousing)
library(DT) # datatable()
library(ggforce) # facet_wrap_paginate()
library(corrplot)
library(GGally) # ggpairs()
library(moderndive) # get_regression_table() # more info than broom::tidy()
library(car) # vif()
library(leaps)
library(broom) # glance
options(digits = 3, scipen = 999) # remove scientific notation
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# ames <- read_csv("Savannah S/Ames/AmesHousing.csv")
ames <- ames_raw
ames %<>%
rename_all(make.names) # replace space in variable names with .
## Continuous
# Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1,
# BsmtFin.SF.2, Bsmt.Unf.SF, Total.Bsmt.SF,
# 1st.Flr.SF, 2nd.Flr.SF, Low.Qual.Fin.SF, Gr.Liv.Area,
# Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice
## Discrete
# Year Built, Year Remod, Bsmt Full Bath, Bsmt Half Bath, Full Bath,
# Half Bath, Bedroom AbvGr, Kitchen AbvGr, TotRmsAbvGrd,
# Fireplaces, Garage Yr Blt, Garage Cars,
# Mo Sold, Yr Sold,
# Other
# PID is parcel IDs as chr
# Count NAs in all Variables
var.class <- data.frame(Class = sapply(ames, class))
var.class <- tibble(Variable = row.names(var.class),
Class = (if_else(var.class$Class == "character",
"Nominal or Ordinal",
"Continuous or Discrete")))
ames %>%
keep(~ sum(is.na(.x)) > 0) %>% # keep columns w/NAs
gather(key = "Variable") %>% # pivot.longer error with chr class variables
group_by(Variable) %>%
summarize(Count_NAs = sum(is.na(value))) %>%
inner_join(var.class) %>% # make "Class" column
# filter(Class == "Continuous or Discrete") %>%
datatable(options = list(pageLength = 5),
caption = "NAs")
## Joining, by = "Variable"
# Summary of continuous and discrete data
ames.continuous <-
ames %>%
select(Lot.Frontage, Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2,
Bsmt.Unf.SF, Total.Bsmt.SF, X1st.Flr.SF, X2nd.Flr.SF, Low.Qual.Fin.SF,
Gr.Liv.Area, Garage.Area, Wood.Deck.SF:Pool.Area, Misc.Val, SalePrice)
ames.continuous %>%
pivot_longer(cols = everything(), names_to = "Variable") %>%
group_by(Variable) %>%
summarize(across(.cols = value,
list(Min = ~ min(.x, na.rm = TRUE),
Mean = ~ mean(.x, na.rm = TRUE),
Median = ~ median(.x, na.rm = TRUE),
Max = ~ max(.x, na.rm = TRUE),
Mode = getmode, # 0 can mean no mode
StdDev = ~ sd(.x, na.rm = TRUE),
NAs = ~ sum(is.na(.x))
),
.names = "{.fn}"
)
) %>%
mutate(across(.cols = !Variable, round, 2)) %>%
datatable(options = list(pageLength = 5),
caption = "Summary of Continuous Data")
# Version 2
# ames %>%
# mutate(across(.cols = PID, as.integer)) %>%
# select(where(is_integer)) %>%
# pivot_longer(cols = everything(), names_to = "Variable") %>%
# group_by(Variable) %>%
# summarize(Min = min(value, na.rm = TRUE),
# Mean = mean(value, na.rm = TRUE),
# Median = median(value, na.rm = TRUE),
# Max = max(value, na.rm = TRUE),
# Mode = getmode(value),
# StdDev = sd(value, na.rm = TRUE),
# NAs = sum(is.na(value))
# ) %>%
# mutate(across(.cols = !Variable, round, 2)) %>%
# datatable(options = list(pageLength = 5),
# caption = "Summary of continuous and discrete data")
# Summary of character (categorical) data
# ames %>%
# select(where(is_character), -PID) %>%
# mutate(across(.cols = everything(), as_factor)) %>%
# summary()
# ames %>%
# select(where(is_character), -PID) %>%
# pivot_longer(cols = everything(), names_to = "Variable") %>%
# group_by(Variable) %>%
# summarise(across(.cols = value,
# list(Mode = getmode,
# NAs = ~ sum(is.na(.x))),
# .names = "{.fn}")
# ) %>%
# datatable(options = list(pageLength = 5),
# caption = "Summary of character (categorical) data")
# All NAs appear to be correctly and intentionally entered.
# Some incomplete cases contain conflicting information
# "#" categorical NAs
# "###" discrete/continuous NAs
# Alley = No Alley Access
# Bsmt.Cond = No Basement
# Bsmt.Exposure = No Basement
### Bsmt.Full.Bath = 0, discrete
### Bsmt.Half.Bath =0, discrete
# Bsmt.Qual = No Basement
### Bsmt.Unf.SF = 0, continuous
### BsmtFin.SF.1 = 0, continuous
### BsmtFin.SF.2 = 0, continuous
# BsmtFin.Type.1 = No Basement
# BsmtFin.Type.2 = No Second Category
# Fence = No Fence
# Fireplace.qu = No FirePlace
### Garage.Area = unclear for Order = 2237, continuous
### Garage.Cars = unclear for Order = 2237, discrete
# Garage.Cond = No Garage
# Garage.Finish = No Garage
# Garage.Qual = No Garage
# Garage.Type = No Garage
### Garage.Yr.Blt = unclear/unknown for Order = (1357,2237), else no garage, discrete
### Lot.Frontage = 0, continuous
### Mas.Vnr.Area = 0, continuous
# Mas.Vnr.Type = None
# Misc.Feature = None
# Pool.QC = No Pool
### Total.Bsmt.SF = 0, continuous
# used similar code to identify continuous/discrete NA information:
# ames %>% filter(is.na(Garage.Cars)) %>% select(Order, contains("Garage")) %>% view()
ames.continuous %<>%
slice(-1357, -2237) %>% # see Garage comments above
drop_na() # 508 observations
ames.continuous %>%
cor() %>%
round(digits = 2) %>%
datatable(options = list(pageLength = 5), caption = "Continuous Correlations")
ames.continuous %>%
cor() %>%
corrplot(type = "upper",
diag = FALSE,
title = "Continuous Correlations",
order = "alphabet")
# All continuous correlations with SalePrice
SalePrice.cor <-
ames.continuous %>%
cor() %>%
as.data.frame() %>%
select(SalePrice)
# max correlation with Saleprice
SalePrice.cor %>%
arrange(desc(SalePrice)) %>%
slice(2)
## SalePrice
## Gr.Liv.Area 0.707
# min correlation with Saleprice
SalePrice.cor %>%
slice_min(SalePrice)
## SalePrice
## Enclosed.Porch -0.149
# closest to 0.5 correlation with SalePrice
SalePrice.cor %>%
arrange(desc(SalePrice)) %>%
filter(near(SalePrice, 0.5, 0.05))
## SalePrice
## Mas.Vnr.Area 0.533
# Pairs
ames %>%
select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
ggpairs()
ames %>%
select(Gr.Liv.Area, Enclosed.Porch, Mas.Vnr.Area, SalePrice) %>%
pivot_longer(-SalePrice, names_to = "Variable", values_to = "value") %>%
ggplot(aes(value, SalePrice)) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, se = FALSE) +
facet_wrap(vars(Variable))
# Full Additive Model
model.full <-
ames.continuous %>%
lm(SalePrice ~ ., data = .)
# Reduced Model
model.reduced <-
ames.continuous %>%
select(-Lot.Area, -Total.Bsmt.SF, -Low.Qual.Fin.SF, -Gr.Liv.Area, -X3Ssn.Porch
) %>% # p-value > 0.4 or missing in full model. Gr.Liv.Area in cor plots
lm(SalePrice ~ ., data = .)
model.reduced %>%
get_regression_table(digits = 2) %>%
datatable(options = list(pageLength = 5),
caption = "Reduced Linear Model for Predicting SalePrice, Rounded, Blanks = NA")
Log(SalePrice) produces better QQ plot but results in model coefficients = 0.
plot(model.reduced, which = 1:6)
https://www.wikiwand.com/en/Variance_inflation_factor#/Step_three
model.reduced %>%
vif() %>%
data.frame(VIF = .) %>%
arrange(desc(VIF)) %>%
mutate(Multicollinearity = if_else(VIF > 10, "True", "False"))
## VIF Multicollinearity
## BsmtFin.SF.1 3.77 False
## X1st.Flr.SF 3.66 False
## Bsmt.Unf.SF 3.31 False
## Garage.Area 1.62 False
## Mas.Vnr.Area 1.41 False
## X2nd.Flr.SF 1.39 False
## BsmtFin.SF.2 1.37 False
## Lot.Frontage 1.36 False
## Open.Porch.SF 1.19 False
## Wood.Deck.SF 1.17 False
## Pool.Area 1.07 False
## Enclosed.Porch 1.07 False
## Misc.Val 1.04 False
## Screen.Porch 1.03 False
## Plot Fitted Values
model.full %>%
fitted() %>%
tibble() %>%
ggplot(aes(y= .)) +
geom_boxplot() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Full Linear Model Fitted Values")
# Fitted Inner Fences (Weak Outliers)
fitted.inner.fence.lower <-
quantile(fitted(model.full), 0.25) - 1.5 * IQR(fitted(model.full))
fitted.inner.fence.upper <-
quantile(fitted(model.full), 0.75) + 1.5 * IQR(fitted(model.full))
# Identify Fitted Weak Outliers
model.full %>%
fitted() %>%
tibble() %>%
filter(. < fitted.inner.fence.lower | . > fitted.inner.fence.upper) %>%
pull() %>%
names() %>%
as.integer() ->
fitted.weak.outlier.index
# Fitted Outer Fences (Strong Outliers)
fitted.outer.fence.lower <-
quantile(fitted(model.full), 0.25) - 3 * IQR(fitted(model.full))
fitted.outer.fence.upper <-
quantile(fitted(model.full), 0.75) + 3 * IQR(fitted(model.full))
# Identify Fitted Strong Outliers
model.full %>%
fitted() %>%
tibble() %>%
filter(. < fitted.outer.fence.lower | . > fitted.outer.fence.upper) %>%
pull() %>%
names() %>%
as.integer() ->
fitted.strong.outlier.index
## Plot Residuals
model.full %>%
residuals() %>%
tibble() %>%
ggplot(aes(y= .)) +
geom_boxplot() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Linear Model Redisuals")
# Residual Inner Fences (Weak Outliers)
residual.inner.fence.lower <-
quantile(residuals(model.full), 0.25) - 1.5 * IQR(residuals(model.full))
residual.inner.fence.upper <-
quantile(residuals(model.full), 0.75) + 1.5 * IQR(residuals(model.full))
# Identify Residual Weak Outliers
model.full %>%
residuals() %>%
tibble() %>%
filter(. < residual.inner.fence.lower | . > residual.inner.fence.upper) %>%
pull() %>%
names() %>%
as.integer() ->
residual.weak.outlier.index
# Residual Outer Fences (Strong Outliers)
residual.outer.fence.lower <-
quantile(residuals(model.full), 0.25) - 3 * IQR(residuals(model.full))
residual.outer.fence.upper <-
quantile(residuals(model.full), 0.75) + 3 * IQR(residuals(model.full))
# Identify Residual Strong Outliers
model.full %>%
residuals() %>%
tibble() %>%
filter(. < residual.outer.fence.lower | . > residual.outer.fence.upper) %>%
pull() %>%
names() %>%
as.integer() ->
residual.strong.outlier.index
## Positions of all outliers
union(fitted.weak.outlier.index, residual.weak.outlier.index) %>% sort()
## [1] 3 14 16 32 34 40 42 58 103 113 143 154 159 162 178
## [16] 204 215 251 252 274 286 296 308 314 317 349 350 351 354 355
## [31] 358 359 360 361 362 364 375 376 384 413 415 419 427 434 494
## [46] 508 524 533 549 552 580 582 596 630 653 753 777 794 801 834
## [61] 836 837 865 866 867 868 870 871 873 874 878 879 882 913 914
## [76] 932 960 969 971 977 980 991 992 1041 1045 1087 1122 1164 1180 1181
## [91] 1193 1210 1232 1261 1283 1285 1307 1328 1339 1341 1343 1344 1345 1346 1384
## [106] 1385 1387 1389 1390 1391 1393 1394 1395 1400 1401 1447 1448 1449 1452 1458
## [121] 1464 1466 1519 1520 1525 1551 1553 1560 1600 1627 1636 1641 1682 1700 1717
## [136] 1725 1794 1795 1824 1828 1837 1851 1869 1871 1890 1906 1911 1912 1913 1914
## [151] 1916 1918 1922 1953 1957 1972 2005 2009 2012 2017 2035 2040 2047 2062 2078
## [166] 2094 2121 2131 2133 2171 2185 2193 2224 2238 2249 2381 2388 2403 2413
# Manually verified that explanatory variable outliers removed in Step (12)
## Model without strong outliers
# strong outlier indices not subsets of each other
fitted.strong.outlier.index %in% residual.strong.outlier.index
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
model.full.strongoutliers <-
ames.continuous %>%
slice(-fitted.strong.outlier.index, -residual.strong.outlier.index) %>%
# select(-X3Ssn.Porch) %>%
lm(SalePrice ~ ., data = .)
## Model with both types of outliers removed
# weak outlier indices not subsets of each other
sum(fitted.weak.outlier.index %in% residual.weak.outlier.index) == length(fitted.weak.outlier.index)
## [1] FALSE
model.full.weakoutliers <-
ames.continuous %>%
slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
# select(-X3Ssn.Porch) %>%
lm(SalePrice ~ ., data = .)
rseek.org
model.full %>% alias() # identify linearly dependent variables
## Model :
## SalePrice ~ Lot.Frontage + Lot.Area + Mas.Vnr.Area + BsmtFin.SF.1 +
## BsmtFin.SF.2 + Bsmt.Unf.SF + Total.Bsmt.SF + X1st.Flr.SF +
## X2nd.Flr.SF + Low.Qual.Fin.SF + Gr.Liv.Area + Garage.Area +
## Wood.Deck.SF + Open.Porch.SF + Enclosed.Porch + X3Ssn.Porch +
## Screen.Porch + Pool.Area + Misc.Val
##
## Complete :
## (Intercept) Lot.Frontage Lot.Area Mas.Vnr.Area BsmtFin.SF.1
## Total.Bsmt.SF 0 0 0 0 1
## Gr.Liv.Area 0 0 0 0 0
## BsmtFin.SF.2 Bsmt.Unf.SF X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF
## Total.Bsmt.SF 1 1 0 0 0
## Gr.Liv.Area 0 0 1 1 1
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## Total.Bsmt.SF 0 0 0 0 0
## Gr.Liv.Area 0 0 0 0 0
## Screen.Porch Pool.Area Misc.Val
## Total.Bsmt.SF 0 0 0
## Gr.Liv.Area 0 0 0
ames.continuous.predictors <-
ames.continuous %>%
slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>% # outliers
select(-Total.Bsmt.SF, -Gr.Liv.Area) # remove linearly dependent variables
# Find best model of each size
models.leaps <-
leaps(x = ames.continuous.predictors %>% select(-SalePrice),
y = ames.continuous.predictors$SalePrice,
method = "adjr2",
nbest = 1,
names = ames.continuous.predictors %>% names())
# coeffiences for said models
models.regsubsets <-
regsubsets(SalePrice ~ .,
data = ames.continuous.predictors,
nbest = 1,
nvmax = NCOL(ames.continuous.predictors) - 1)
# model selected for some balance of highest highest adjusted r^2 and d.f.
models.leaps$adjr2
## [1] 0.445 0.610 0.713 0.741 0.770 0.779 0.787 0.794 0.799 0.802 0.804 0.805
## [13] 0.805 0.805 0.805 0.805 0.805
# models.leaps$which[12, ]
coef(models.regsubsets, 12) %>% names()
## [1] "(Intercept)" "Lot.Area" "Mas.Vnr.Area" "BsmtFin.SF.1"
## [5] "BsmtFin.SF.2" "Bsmt.Unf.SF" "X1st.Flr.SF" "X2nd.Flr.SF"
## [9] "Garage.Area" "Wood.Deck.SF" "Open.Porch.SF" "Enclosed.Porch"
## [13] "Pool.Area"
# create "All Subsets Regression Method" model
model.asr.outliers <-
ames.continuous.predictors %>%
select(Lot.Area, Mas.Vnr.Area, BsmtFin.SF.1, BsmtFin.SF.2, Bsmt.Unf.SF,
X1st.Flr.SF, X2nd.Flr.SF, Garage.Area, Wood.Deck.SF, Open.Porch.SF,
Enclosed.Porch, Pool.Area, SalePrice) %>%
slice(-fitted.weak.outlier.index, -residual.weak.outlier.index) %>%
lm(SalePrice ~., data = .)
bind_rows(model.full %>% glance,
model.reduced %>% glance,
model.full.strongoutliers %>% glance,
model.full.weakoutliers %>% glance,
model.asr.outliers %>% glance) %>%
mutate(across(.cols = c(r.squared, adj.r.squared), round, 2)) %>%
mutate(across(.cols = c(sigma:nobs), round, 0)) %>%
mutate(Model = c("Full Additive",
"Reduced for significance",
"Reduced for Strong Outliers",
"Reduced for All Outliers ",
"All Subsets Method, Reduced for Outliers & Lin. Dep."),
.before = "r.squared") %>%
datatable(caption = "Compare Linear Models, rounded")