The purpose of this project is to determine optimal feature combinations for a multiple regression prediction of home sales prices.
The Iowa Homes dataset was used, obtained from Kaggle.com (andradaolteanu, 2019) and loaded into RStudio, an integrated development environment (IDE) that allows both code inspection and the inclusion of descriptions in the R Markdown file accompanying this report. The data had previously been split into a train.csv file for the training set and a test.csv file for testing the models developed. For this project, only the train.csv file was used, and it was additionally split for cross-validation as discussed later in this report.
#Load packages
library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(boot)
library(ggplot2)
library(moments)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.1
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(rsample)
## Warning: package 'rsample' was built under R version 4.5.2
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.5.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.11 ✔ tailor 0.1.0
## ✔ dials 1.4.2 ✔ tune 2.0.1
## ✔ infer 1.1.0 ✔ workflows 1.3.0
## ✔ modeldata 1.5.1 ✔ workflowsets 1.1.1
## ✔ parsnip 1.4.0 ✔ yardstick 1.3.2
## ✔ recipes 1.3.1
## Warning: package 'broom' was built under R version 4.5.2
## Warning: package 'dials' was built under R version 4.5.2
## Warning: package 'modeldata' was built under R version 4.5.2
## Warning: package 'parsnip' was built under R version 4.5.2
## Warning: package 'tailor' was built under R version 4.5.2
## Warning: package 'tune' was built under R version 4.5.2
## Warning: package 'workflows' was built under R version 4.5.2
## Warning: package 'workflowsets' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(parsnip)
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
##
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:rsample':
##
## calibration
##
## The following object is masked from 'package:purrr':
##
## lift
library(yardstick)
#Upload dataset from Kaggle.com: andradaolteanu. (2019, August 18). Housing Prices Competition - Iowa Dataset. Kaggle.com; Kaggle. https://www.kaggle.com/code/andradaolteanu/housing-prices-competition-iowa-dataset?select=test.csv
getwd()
## [1] "C:/Users/benke/Downloads/New folder (3)"
# Load the training data from the current working directory.
# The file is read by relative path instead of calling setwd() with a
# machine-specific absolute path, so the script also runs on other machines.
data_file <- "Iowa.house.train.csv"
if (!file.exists(data_file)) {
  stop(
    "Cannot find '", data_file, "' in the working directory: ", getwd(),
    call. = FALSE
  )
}
house2 <- read.csv(data_file)
head(house2)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
The dataset consists of 81 columns and 1460 observations. Of the 81 variables, 43 were nominal, character data types, and 38 were numeric. Of the 38 numeric variables, three (OverallQual, OverallCond and MoSold) were rating scales or category codes and were reclassified as factors to ensure they were not mistakenly included in quantitative analysis as numeric data, given their ordinal or categorical nature.
#View characteristics of the data
glimpse(house2)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
str(house2)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
#View summary statistics to identify further pre-processing requirements, such as possible outliers and missing values.
summary(house2)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Data inspection revealed a total of 6965 NAs in the dataset, with 100% of observations containing at least one NA. At the variable level, 19 columns (23% of all columns) included NAs. Due to the high number of NAs, imputation was used during preprocessing to remove them: missing numeric values were replaced with the column mean, missing factor values with the mode, and missing character values were recoded as the literal category "NA".
# Total number of missing cells, summed column by column
total_na_count <- sum(
  vapply(house2, function(column) sum(is.na(column)), integer(1))
)
total_na_count
## [1] 6965
# Number of rows containing at least one NA, and that number as a share of
# all rows
rows_with_nas <- sum(!complete.cases(house2))
Percent_row_NA <- percent(rows_with_nas / nrow(house2))
rows_with_nas
## [1] 1460
Percent_row_NA
## [1] "100%"
# Number of columns containing at least one NA, and that number as a share of
# all columns
cols_with_nas <- sum(vapply(house2, anyNA, logical(1)))
Percent_col_NA <- percent(cols_with_nas / ncol(house2))
cols_with_nas
## [1] 19
Percent_col_NA
## [1] "23%"
# Categorical columns using a number as a quality rating or category, such as
# month sold, are converted to factors
factor_columns <- c("OverallQual", "OverallCond", "MoSold")
house2[factor_columns] <- lapply(
  house2[factor_columns],
  function(col) as.factor(as.character(col))
)

# Replace the NAs in a single column.
#
# Numeric (incl. integer): the column mean when more than `min_obs_for_mean`
#   values are observed; otherwise linear interpolation over the row index.
# Factor: the mode (most common level).
# Character: the literal string "NA", so missingness becomes its own category.
#
# Returns the column with the same length as the input.
impute_column <- function(col, min_obs_for_mean = 10) {
  missing <- is.na(col)

  if (is.numeric(col)) { # is.numeric() is TRUE for integer and double
    if (sum(!missing) > min_obs_for_mean) {
      col[missing] <- mean(col, na.rm = TRUE)
    } else {
      observed <- which(!missing)
      if (length(observed) >= 2) {
        # Interpolate at the positions that are actually missing; rule = 2
        # carries the nearest observed value outwards for leading/trailing NAs.
        col[missing] <- approx(
          x = observed,
          y = col[observed],
          xout = which(missing),
          rule = 2
        )[["y"]]
      } else if (length(observed) == 1) {
        # A single observation cannot be interpolated; reuse its value.
        col[missing] <- col[observed]
      }
      # No observed values at all: nothing to impute from, NAs are left as is.
    }
  } else if (is.factor(col)) {
    # which.max() returns the first level with the highest count
    mode_val <- names(which.max(table(col)))
    if (length(mode_val) == 1) {
      col[missing] <- mode_val
    }
  } else if (is.character(col)) {
    col[missing] <- "NA"
  }

  col
}

# Impute every column, then convert the resulting list back to a data frame
house2 <- as.data.frame(lapply(house2, impute_column))
#
# Imputing with the method above changes some of the summary statistics.
# Check the updated dataset and confirm that no NAs remain.
summary(house2)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 60.00
## Median : 730.5 Median : 50.0 Mode :character Median : 70.05
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 79.00
## Max. :1460.0 Max. :190.0 Max. :313.00
##
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## Length:1460 5 :397 5 :821 Min. :1872 Min. :1950
## Class :character 6 :374 6 :252 1st Qu.:1954 1st Qu.:1967
## Mode :character 7 :319 7 :205 Median :1973 Median :1994
## 8 :168 8 : 72 Mean :1971 Mean :1985
## 4 :116 4 : 57 3rd Qu.:2000 3rd Qu.:2004
## 9 : 43 3 : 25 Max. :2010 Max. :2010
## (Other): 43 (Other): 28
## RoofStyle RoofMatl Exterior1st Exterior2nd
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## MasVnrType MasVnrArea ExterQual ExterCond
## Length:1460 Min. : 0.0 Length:1460 Length:1460
## Class :character 1st Qu.: 0.0 Class :character Class :character
## Mode :character Median : 0.0 Mode :character Mode :character
## Mean : 103.7
## 3rd Qu.: 164.2
## Max. :1600.0
##
## Foundation BsmtQual BsmtCond BsmtExposure
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Length:1460 Min. : 0.0 Length:1460 Min. : 0.00
## Class :character 1st Qu.: 0.0 Class :character 1st Qu.: 0.00
## Mode :character Median : 383.5 Mode :character Median : 0.00
## Mean : 443.6 Mean : 46.55
## 3rd Qu.: 712.2 3rd Qu.: 0.00
## Max. :5644.0 Max. :1474.00
##
## BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.0 Min. : 0.0 Length:1460 Length:1460
## 1st Qu.: 223.0 1st Qu.: 795.8 Class :character Class :character
## Median : 477.5 Median : 991.5 Mode :character Mode :character
## Mean : 567.2 Mean :1057.4
## 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :2336.0 Max. :6110.0
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF
## Length:1460 Length:1460 Min. : 334 Min. : 0
## Class :character Class :character 1st Qu.: 882 1st Qu.: 0
## Mode :character Mode :character Median :1087 Median : 0
## Mean :1163 Mean : 347
## 3rd Qu.:1391 3rd Qu.: 728
## Max. :4692 Max. :2065
##
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.000 Median :1464 Median :0.0000 Median :0.00000
## Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753
## 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000
##
## FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :0.0000 Median :3.000 Median :1.000
## Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces
## Length:1460 Min. : 2.000 Length:1460 Min. :0.000
## Class :character 1st Qu.: 5.000 Class :character 1st Qu.:0.000
## Mode :character Median : 6.000 Mode :character Median :1.000
## Mean : 6.518 Mean :0.613
## 3rd Qu.: 7.000 3rd Qu.:1.000
## Max. :14.000 Max. :3.000
##
## FireplaceQu GarageType GarageYrBlt GarageFinish
## Length:1460 Length:1460 Min. :1900 Length:1460
## Class :character Class :character 1st Qu.:1962 Class :character
## Mode :character Mode :character Median :1979 Mode :character
## Mean :1979
## 3rd Qu.:2001
## Max. :2010
##
## GarageCars GarageArea GarageQual GarageCond
## Min. :0.000 Min. : 0.0 Length:1460 Length:1460
## 1st Qu.:1.000 1st Qu.: 334.5 Class :character Class :character
## Median :2.000 Median : 480.0 Mode :character Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Length:1460 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Median : 0.00 Median : 25.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Length:1460
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Class :character
## Median : 0.00 Median : 0.00 Median : 0.000 Mode :character
## Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## Length:1460 Length:1460 Min. : 0.00 6 :253
## Class :character Class :character 1st Qu.: 0.00 7 :234
## Mode :character Mode :character Median : 0.00 5 :204
## Mean : 43.49 4 :141
## 3rd Qu.: 0.00 8 :122
## Max. :15500.00 3 :106
## (Other):400
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 Length:1460 Length:1460 Min. : 34900
## 1st Qu.:2007 Class :character Class :character 1st Qu.:129975
## Median :2008 Mode :character Mode :character Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
The presence of outliers was also investigated and managed in the pre-processing step. Box plot visualization was used for four variables (lot area, total basement square footage, basement square footage (BsmtFinSF1) and general living area (GrLivArea)) previously reported as important features for the prediction of home sale prices (Sharma et al., 2024). Box plot visualization indicated that outlier data points may affect model performance. To eliminate outliers for model training, observations with a general living area that was either less than Q1 − 1.5 × IQR or more than Q3 + 1.5 × IQR were removed from the dataset (Zach, 2020). General living area was chosen as the feature for eliminating outliers because filtering on it is likely to exclude very large or very small houses, which may otherwise negatively affect model training. Once the outliers were removed, the dataset was split into training and testing datasets.
# Boxplot visualizations for variables whose max values appear far higher than
# the 3rd quartile, to inspect for outliers.

# Plot the distribution of one column of `data`.
#   data: data frame holding the column.
#   col:  column name as a string.
#   bins: number of histogram bins used for numeric columns.
# Returns a bar graph for factor columns, a histogram for numeric columns
# (integers included), and NULL for any other column type.
plot_distribution <- function(data, col, bins = 30) {
  values <- data[[col]]
  if (is.factor(values)) {
    ggplot(data, aes(x = .data[[col]], fill = .data[[col]])) +
      geom_bar() +
      labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
      theme_minimal() +
      theme(legend.position = "right")
  } else if (is.numeric(values)) {
    # A fixed bin count adapts to each column's range. The previous fixed
    # binwidth of 0.3 split columns spanning thousands of units into hundreds
    # of thousands of near-empty bins.
    ggplot(data, aes(x = .data[[col]])) +
      geom_histogram(bins = bins) +
      labs(title = paste("Histogram for", col), x = col, y = "Count") +
      theme_minimal()
  } else {
    NULL
  }
}

# Draw a single box plot of one numeric column of `data`, with outlying points
# highlighted in red.
#   data: data frame holding the column.
#   col:  column name as a string.
plot_box <- function(data, col) {
  ggplot(data, aes(x = "", y = .data[[col]])) +
    geom_boxplot(
      fill = "skyblue",
      color = "darkblue",
      width = .25,
      outlier.color = "red",
      outlier.size = 2
    ) +
    labs(
      title = paste("Box Plot for", col),
      x = NULL,
      y = "Value"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
      axis.title.y = element_text(size = 14),
      axis.text.y = element_text(size = 12)
    )
}

# Distribution plot followed by box plot for each variable under inspection.
# Plots are printed explicitly because auto-printing does not happen inside a
# for loop.
for (col in c("LotArea", "BsmtFinSF1", "TotalBsmtSF", "GrLivArea")) {
  distribution_plot <- plot_distribution(house2, col)
  if (!is.null(distribution_plot)) {
    print(distribution_plot)
  }
  print(plot_box(house2, col))
}
# Identify and remove outliers. Zach. (2020, August 6). How to Remove Outliers in R. Statology. https://www.statology.org/remove-outliers-r/
# First and third quartiles of general living area
Q1 <- quantile(house2$GrLivArea, .25)
Q3 <- quantile(house2$GrLivArea, .75)
# Interquartile range, stored under its own name so that the stats::IQR()
# function is not shadowed by a numeric variable called IQR
gr_liv_iqr <- IQR(house2$GrLivArea)
# Fences located 1.5 * IQR below Q1 and 1.5 * IQR above Q3
lower_fence <- Q1 - 1.5 * gr_liv_iqr
upper_fence <- Q3 + 1.5 * gr_liv_iqr
# Keep only rows whose GrLivArea lies strictly between the two fences
house2_no_out <- subset(
  house2,
  GrLivArea > lower_fence & GrLivArea < upper_fence
)
# View dimensions of the new dataset to determine data loss
dim(house2_no_out)
## [1] 1429 81
# Re-plot GrLivArea from the outlier-free dataset, then inspect its structure.
col <- "GrLivArea"
# The column type is checked on house2_no_out, the same data frame that is
# plotted (the numeric branch previously tested house2 instead).
if (is.factor(house2_no_out[[col]])) {
  # Bar graph for factor columns
  ggplot(house2_no_out, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(house2_no_out[[col]])) {
  # Histogram for numeric columns (is.numeric() is also TRUE for integers).
  # A fixed bin count replaces the former binwidth of 0.3, which split a
  # column spanning thousands of units into thousands of near-empty bins.
  ggplot(house2_no_out, aes(x = .data[[col]])) +
    geom_histogram(bins = 30) +
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}
# Box plot of the same column, with any remaining outlying points in red
ggplot(house2_no_out, aes(x = "", y = .data[[col]])) +
  geom_boxplot(
    fill = "skyblue",
    color = "darkblue",
    width = .25,
    outlier.color = "red",
    outlier.size = 2
  ) +
  labs(
    title = paste("Box Plot for", col),
    x = NULL,
    y = "Value"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )
# Column types and leading values of the outlier-free dataset
str(house2_no_out)
## 'data.frame': 1429 obs. of 81 variables:
## $ Id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num 65 80 68 60 84 ...
## $ LotArea : num 8450 9600 11250 9550 14260 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr "NA" "NA" "NA" "NA" ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : Factor w/ 10 levels "1","10","2","3",..: 8 7 8 8 9 6 9 8 8 6 ...
## $ OverallCond : Factor w/ 9 levels "1","2","3","4",..: 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num 706 978 486 216 655 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num 856 1262 920 756 1145 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : num 856 1262 920 961 1145 ...
## $ X2ndFlrSF : num 854 0 866 756 1053 ...
## $ LowQualFinSF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr "NA" "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : num 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr "NA" "NA" "NA" "NA" ...
## $ Fence : chr "NA" "NA" "NA" "NA" ...
## $ MiscFeature : chr "NA" "NA" "NA" "NA" ...
## $ MiscVal : num 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : Factor w/ 12 levels "1","10","11",..: 5 8 12 5 4 2 11 3 7 1 ...
## $ YrSold : num 2008 2007 2008 2006 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num 208500 181500 223500 140000 250000 ...
Following pre-processing, the dataset was split into a training and a testing dataset with 80% training data and 20% testing data. To identify relationships between variables for use in the multiple regression model, a subset of numeric variables, including variables reported by Sharma et al. (2024) as important features, were combined in a data frame for calculation of a correlation table.
# Split the outlier-free data 80/20 into training and testing sets.
# The seed is fixed so the random split is reproducible.
set.seed(123)
house2_no_out_split <- house2_no_out %>%
  initial_split(prop = .80)
train_house <- house2_no_out_split %>%
  training()
test_house <- house2_no_out_split %>%
  testing()
# Preview the first rows of the training set
train_house %>%
  head()
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 422 20 RL 70.04996 16635 Pave NA IR1 Lvl
## 2 470 60 RL 76.00000 9291 Pave NA IR1 Lvl
## 3 181 160 FV 70.04996 2117 Pave NA Reg Lvl
## 4 535 60 RL 74.00000 9056 Pave NA IR1 Lvl
## 5 199 75 RM 92.00000 5520 Pave NA Reg Lvl
## 6 954 60 RL 70.04996 11075 Pave NA IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub FR2 Gtl NWAmes Norm Norm 1Fam
## 2 AllPub Corner Gtl SawyerW RRNe Norm 1Fam
## 3 AllPub Inside Gtl Somerst Norm Norm Twnhs
## 4 AllPub Inside Gtl Gilbert Norm Norm 1Fam
## 5 AllPub Corner Gtl OldTown Norm Norm 1Fam
## 6 AllPub Inside Mod Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 1Story 6 7 1977 2000 Gable CompShg
## 2 2Story 6 5 1993 1993 Gable CompShg
## 3 2Story 6 5 2000 2000 Gable CompShg
## 4 2Story 8 5 2004 2004 Gable CompShg
## 5 2.5Fin 6 6 1912 1950 Gable CompShg
## 6 2Story 5 4 1969 1969 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 CemntBd CmentBd Stone 126 Gd TA CBlock
## 2 HdBoard HdBoard BrkFace 120 Gd TA PConc
## 3 MetalSd MetalSd BrkFace 456 Gd TA PConc
## 4 VinylSd VinylSd None 0 Gd TA PConc
## 5 Wd Sdng Wd Sdng None 0 TA TA CBlock
## 6 HdBoard HdBoard BrkFace 232 TA TA CBlock
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No ALQ 1246 Unf
## 2 Gd TA No GLQ 426 Unf
## 3 Gd TA No GLQ 436 Unf
## 4 Ex Gd Av Unf 0 Unf
## 5 TA TA No Unf 0 Unf
## 6 TA TA Av ALQ 562 LwQ
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 356 1602 GasA Gd Y SBrkr
## 2 0 406 832 GasA Ex Y SBrkr
## 3 0 320 756 GasA Ex Y SBrkr
## 4 0 707 707 GasA Ex Y SBrkr
## 5 0 755 755 GasA Ex Y SBrkr
## 6 193 29 784 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 1602 0 0 1602 0 1 2
## 2 832 878 0 1710 0 0 2
## 3 769 756 0 1525 0 0 2
## 4 707 707 0 1414 0 0 2
## 5 929 929 371 2229 0 0 1
## 6 1168 800 0 1968 0 1 2
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 0 3 1 Gd 8 Typ
## 2 1 3 1 Gd 7 Typ
## 3 1 3 1 Gd 5 Typ
## 4 1 3 1 Gd 6 Typ
## 5 0 5 1 TA 8 Typ
## 6 1 4 1 TA 7 Min2
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 1 TA Attchd 1977.000 Fin 2
## 2 0 NA Attchd 1993.000 RFn 2
## 3 1 TA Detchd 2000.000 Unf 2
## 4 1 Gd Attchd 2004.000 Fin 2
## 5 0 NA NA 1978.506 NA 0
## 6 1 Po Attchd 1969.000 RFn 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 529 TA TA Y 240 0
## 2 506 TA TA Y 144 70
## 3 440 TA TA Y 0 0
## 4 403 TA TA Y 100 35
## 5 0 NA NA Y 0 198
## 6 530 TA TA Y 305 189
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 NA NA NA
## 2 0 0 0 0 NA NA NA
## 3 0 0 0 0 NA NA NA
## 4 0 0 0 0 NA NA NA
## 5 30 0 0 0 NA MnPrv NA
## 6 0 0 0 0 NA MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 6 2009 WD Normal 215000
## 2 0 6 2008 WD Normal 187000
## 3 0 6 2007 WD Normal 177000
## 4 0 10 2006 WD Normal 178000
## 5 0 7 2009 WD Abnorml 104000
## 6 400 9 2008 WD Normal 172000
# Numeric predictors considered for the regression models, plus the response
numeric_model_columns <- c(
  "LotArea",
  "MasVnrArea",
  "BsmtFinSF1",
  "TotalBsmtSF",
  "GrLivArea",
  "GarageArea",
  "SalePrice"
)
# Restrict the training set to those columns and preview the result
num_house_data_train <- train_house[, numeric_model_columns]
head(num_house_data_train)
## LotArea MasVnrArea BsmtFinSF1 TotalBsmtSF GrLivArea GarageArea SalePrice
## 1 16635 126 1246 1602 1602 529 215000
## 2 9291 120 426 832 1710 506 187000
## 3 2117 456 436 756 1525 440 177000
## 4 9056 0 0 707 1414 403 178000
## 5 5520 0 0 755 2229 0 104000
## 6 11075 232 562 784 1968 530 172000
The correlation coefficients and the features determined in the literature as important (Sharma et al., 2024) were considered as options for inclusion in the development of the multiple regression model. Four models were created. The first model used only two predictors chosen with a moderate relationship between sale price (SalePrice) and variable value (garage area and general living area). The second model used variables with a weak relationship between sale price and variable value (finished basement square footage (BsmtFinSF1) and lot area (LotArea)) as a comparison. The third model was developed to investigate the impact of four variables, general living area, total basement square footage, finished basement square footage and lot area. Finally, the fourth model used three variables with a moderate relationship between sale price and value, general living area, total basement square footage, and garage area. Of the four models, the fourth model with three variables which demonstrate a moderate relationship between sale price and variable value demonstrated the highest performance.
# Pairwise correlation coefficients of the numeric training columns
house_cor <- num_house_data_train %>%
  cor()
# Render the correlation matrix as a formatted table
kable_classic(
  kbl(
    house_cor,
    caption = "Correlation Coefficients of Numeric Variables in Training Dataset"
  )
)
| LotArea | MasVnrArea | BsmtFinSF1 | TotalBsmtSF | GrLivArea | GarageArea | SalePrice | |
|---|---|---|---|---|---|---|---|
| LotArea | 1.0000000 | 0.0647277 | 0.1886994 | 0.2230464 | 0.2368129 | 0.2015887 | 0.2732216 |
| MasVnrArea | 0.0647277 | 1.0000000 | 0.2420855 | 0.3297760 | 0.3402613 | 0.3703872 | 0.4479425 |
| BsmtFinSF1 | 0.1886994 | 0.2420855 | 1.0000000 | 0.4643250 | 0.1299623 | 0.3000188 | 0.4356013 |
| TotalBsmtSF | 0.2230464 | 0.3297760 | 0.4643250 | 1.0000000 | 0.3693650 | 0.4908337 | 0.6533336 |
| GrLivArea | 0.2368129 | 0.3402613 | 0.1299623 | 0.3693650 | 1.0000000 | 0.4544911 | 0.6935590 |
| GarageArea | 0.2015887 | 0.3703872 | 0.3000188 | 0.4908337 | 0.4544911 | 1.0000000 | 0.6501967 |
| SalePrice | 0.2732216 | 0.4479425 | 0.4356013 | 0.6533336 | 0.6935590 | 0.6501967 | 1.0000000 |
# Model 1: two numeric predictors (garage area and general living area) that
# are among the most strongly correlated with sale price
model1 <- lm(
  formula = SalePrice ~ GarageArea + GrLivArea,
  data = num_house_data_train
)
# Sequential analysis-of-variance table for model 1
anova(model1)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## GarageArea 1 2.5605e+12 2.5605e+12 1276.49 < 2.2e-16 ***
## GrLivArea 1 1.2095e+12 1.2095e+12 602.97 < 2.2e-16 ***
## Residuals 1140 2.2867e+12 2.0059e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient estimates, residual standard error, R-squared and F-statistic for model 1
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea, data = num_house_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -160486 -22420 -474 19953 310231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11867.111 4715.304 -2.517 0.012 *
## GarageArea 148.184 7.171 20.665 <2e-16 ***
## GrLivArea 81.126 3.304 24.555 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44790 on 1140 degrees of freedom
## Multiple R-squared: 0.6224, Adjusted R-squared: 0.6218
## F-statistic: 939.7 on 2 and 1140 DF, p-value: < 2.2e-16
# Model 2: two predictors with a lower correlation to sale price (finished
# basement square footage and lot area), fitted to inspect the impact on the
# F and p values
model2 <- lm(
  formula = SalePrice ~ BsmtFinSF1 + LotArea,
  data = num_house_data_train
)
# Sequential analysis-of-variance table for model 2
anova(model2)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## BsmtFinSF1 1 1.1493e+12 1.1493e+12 280.048 < 2.2e-16 ***
## LotArea 1 2.2917e+11 2.2917e+11 55.844 1.558e-13 ***
## Residuals 1140 4.6784e+12 4.1038e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient estimates, residual standard error, R-squared and F-statistic for model 2
summary(model2)
##
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + LotArea, data = num_house_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -290840 -44665 -13722 35159 310557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.299e+05 3.474e+03 37.383 < 2e-16 ***
## BsmtFinSF1 6.730e+01 4.479e+00 15.024 < 2e-16 ***
## LotArea 1.858e+00 2.486e-01 7.473 1.56e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64060 on 1140 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.2262
## F-statistic: 167.9 on 2 and 1140 DF, p-value: < 2.2e-16
# Model 3: four numeric variables reported by Sharma et al. (2024) as
# important features, fitted to inspect the impact of using 4 variables on
# model performance
model3 <- lm(
  formula = SalePrice ~ GrLivArea + TotalBsmtSF + BsmtFinSF1 + LotArea,
  data = num_house_data_train
)
# Sequential analysis-of-variance table for model 3
anova(model3)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## GrLivArea 1 2.9135e+12 2.9135e+12 1794.0267 < 2e-16 ***
## TotalBsmtSF 1 1.1063e+12 1.1063e+12 681.2235 < 2e-16 ***
## BsmtFinSF1 1 1.8359e+11 1.8359e+11 113.0472 < 2e-16 ***
## LotArea 1 5.3612e+09 5.3612e+09 3.3013 0.06949 .
## Residuals 1138 1.8481e+12 1.6240e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient estimates, residual standard error, R-squared and F-statistic for model 3
summary(model3)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + BsmtFinSF1 +
## LotArea, data = num_house_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -176355 -20187 2056 22238 218094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.299e+04 4.477e+03 -7.370 3.28e-13 ***
## GrLivArea 8.529e+01 2.899e+00 29.424 < 2e-16 ***
## TotalBsmtSF 6.427e+01 3.521e+00 18.254 < 2e-16 ***
## BsmtFinSF1 3.264e+01 3.147e+00 10.371 < 2e-16 ***
## LotArea 2.922e-01 1.608e-01 1.817 0.0695 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40300 on 1138 degrees of freedom
## Multiple R-squared: 0.6949, Adjusted R-squared: 0.6938
## F-statistic: 647.9 on 4 and 1138 DF, p-value: < 2.2e-16
# Model 4: three numeric variables reported by Sharma et al. (2024) as
# important features, fitted to inspect the impact of using 3 variables on
# model performance; Lot Area is eliminated due to its low correlation
model4 <- lm(
  formula = SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea,
  data = num_house_data_train
)
# Sequential analysis-of-variance table for model 4
anova(model4)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## GrLivArea 1 2.9135e+12 2.9135e+12 1924.83 < 2.2e-16 ***
## TotalBsmtSF 1 1.1063e+12 1.1063e+12 730.89 < 2.2e-16 ***
## GarageArea 1 3.1302e+11 3.1302e+11 206.80 < 2.2e-16 ***
## Residuals 1139 1.7240e+12 1.5136e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient estimates, residual standard error, R-squared and F-statistic for model 4
summary(model4)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea,
## data = num_house_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163581 -18869 782 20619 256040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -38239.016 4318.333 -8.855 <2e-16 ***
## GrLivArea 70.505 2.922 24.127 <2e-16 ***
## TotalBsmtSF 63.264 3.281 19.282 <2e-16 ***
## GarageArea 97.289 6.765 14.381 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38910 on 1139 degrees of freedom
## Multiple R-squared: 0.7154, Adjusted R-squared: 0.7146
## F-statistic: 954.2 on 3 and 1139 DF, p-value: < 2.2e-16
To validate the model and reduce the risk of over-fitting, 10-fold cross validation was performed on the training dataset. 10-fold cross validation was chosen as the preferred cross-validation method due to the low computational cost while continuing to re-sample data in 10 iterations for model training.
# Partition the numeric training data into 10 cross-validation folds
house_folds <- num_house_data_train %>%
  vfold_cv(v = 10)
# Show the analysis/assessment sizes of each fold
house_folds
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [1028/115]> Fold01
## 2 <split [1028/115]> Fold02
## 3 <split [1028/115]> Fold03
## 4 <split [1029/114]> Fold04
## 5 <split [1029/114]> Fold05
## 6 <split [1029/114]> Fold06
## 7 <split [1029/114]> Fold07
## 8 <split [1029/114]> Fold08
## 9 <split [1029/114]> Fold09
## 10 <split [1029/114]> Fold10
# Linear regression specification fitted with the lm engine
multipleModel <- linear_reg(
  mode = "regression",
  engine = "lm"
)
# Workflow for model 4: the model formula combined with the specification
house_workflow <- add_model(
  add_formula(
    workflow(),
    SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea
  ),
  multipleModel
)
house_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Formula
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
# Fit the model 4 workflow on every cross-validation fold
house_train_fit <- fit_resamples(house_workflow, resamples = house_folds)
# Show the per-fold resampling results
house_train_fit
## # Resampling results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [1028/115]> Fold01 <tibble [2 × 4]> <tibble [0 × 4]>
## 2 <split [1028/115]> Fold02 <tibble [2 × 4]> <tibble [0 × 4]>
## 3 <split [1028/115]> Fold03 <tibble [2 × 4]> <tibble [0 × 4]>
## 4 <split [1029/114]> Fold04 <tibble [2 × 4]> <tibble [0 × 4]>
## 5 <split [1029/114]> Fold05 <tibble [2 × 4]> <tibble [0 × 4]>
## 6 <split [1029/114]> Fold06 <tibble [2 × 4]> <tibble [0 × 4]>
## 7 <split [1029/114]> Fold07 <tibble [2 × 4]> <tibble [0 × 4]>
## 8 <split [1029/114]> Fold08 <tibble [2 × 4]> <tibble [0 × 4]>
## 9 <split [1029/114]> Fold09 <tibble [2 × 4]> <tibble [0 × 4]>
## 10 <split [1029/114]> Fold10 <tibble [2 × 4]> <tibble [0 × 4]>
#Average the per-fold metrics (RMSE and R-squared) across the folds
house_train_fit %>%
  collect_metrics()
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 38789. 10 1553. pre0_mod0_post0
## 2 rsq standard 0.715 10 0.0197 pre0_mod0_post0
The purpose of this project was to compare various multiple regression models to determine the combination of variables with the strongest relationship to house sale price for future use in a prediction model. The approach taken in this project was to first identify variables previously reported in the literature as having a relationship with house prices. Once a list of variables was obtained (Sharma et al., 2024), the correlation coefficients were then computed for this dataset to support the identification of candidate variables.
The adjusted r-squared for the best performing model indicated that the variables of above-grade (ground) living area, total basement square footage, and garage area accounted for 71.46% of the variance of the sale price. Use of these variables resulted in a model which performed moderately well, with an F-statistic of 954.2 and p < 0.001. Variables were chosen that had a moderate correlation with the response variable of sale price but did not have high correlations with each other, indicating that each variable included in the model explained a distinct share of the variance in sale prices and added predictive value. Because a small number of variables were chosen for use in the model, many non-linear relationships were not included in this project.
Forys (2022) compared a multiple regression model with neural networks to evaluate performance in house sale price prediction across model types. Forys (2022) explained that multivariate regression models have a high risk of overfitting because the addition of many variables leads to high variance. In comparing neural networks with multivariate regression models, Forys found that the ability of neural networks to analyze non-linear relationships increased model performance (Forys, 2022).
In this project, the use of numerical variables supported feature selection, because correlation coefficients could be used to identify variables of importance for the multiple regression models. Including variables with only a weak relationship to house sale price affected the adjusted r-squared and the F-statistic, while the models continued to show a p-value < 0.001. This reflected the importance of inspecting more than the p-value and r-squared value in determining the performance of a model.
The limitations of this project included the small number of variables used in the four models developed and the inclusion of solely numeric variables. Further inspection of the dataset with incorporation of important categorical variables, such as quality rating, may improve the performance of the model and increase its predictive power. To reduce computational cost, a small number of variables was selected, limiting the robustness of the data used. Increased model performance may be found when using variables not included in the four models developed for this project. Additionally, the residual degrees of freedom are high compared to the model degrees of freedom. This is attributed to the simplicity of the model and the small number of variables used. Although this reduced the computational cost, the low number of variables may risk underfitting. Future work will incorporate categorical variables and increase the number of variables to improve model performance.
Overall, the model developed for this project performed moderately well when combining three valuable variables in a multiple regression model to predict the response variable of home sale price. This project demonstrates that a multiple regression model may be considered in the development of a model to predict the sale price of a house.
Works Cited
andradaolteanu. (2019, August 18). Housing Prices Competition - Iowa Dataset. Kaggle.com; Kaggle. https://www.kaggle.com/code/andradaolteanu/housing-prices-competition-iowa-dataset?select=test.csv
Forys, I. (2022). Machine learning in house price analysis: regression models versus neural networks. Procedia Computer Science, 207, 435–445. https://doi.org/10.1016/j.procs.2022.09.078
Sharma, H., Harsora, H., & Ogunleye, B. (2024). An Optimal House Price Prediction Algorithm: XGBoost. Analytics, 3(1), 30–45. https://doi.org/10.3390/analytics3010003
Sharma, S., Arora, D., Shankar, G., Sharma, P., & Motwani, V. (2023). House Price Prediction using Machine Learning Algorithm. Proceedings - 7th International Conference on Computing Methodologies and Communication, ICCMC 2023, 982–986. https://doi.org/10.1109/ICCMC56507.2023.10084197
Zach. (2020, August 6). How to Remove Outliers in R. Statology. https://www.statology.org/remove-outliers-r/
#Print the citation entry for the tidyverse package
citation(package = "tidyverse")
## To cite package 'tidyverse' in publications use:
##
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
## E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
## Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
## the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
## doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Welcome to the {tidyverse}},
## author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
## year = {2019},
## journal = {Journal of Open Source Software},
## volume = {4},
## number = {43},
## pages = {1686},
## doi = {10.21105/joss.01686},
## }
#Print the citation entry for the dplyr package
citation(package = "dplyr")
## To cite package 'dplyr' in publications use:
##
## Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
## Grammar of Data Manipulation_. doi:10.32614/CRAN.package.dplyr
## <https://doi.org/10.32614/CRAN.package.dplyr>, R package version
## 1.1.4, <https://CRAN.R-project.org/package=dplyr>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {dplyr: A Grammar of Data Manipulation},
## author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
## year = {2023},
## note = {R package version 1.1.4},
## url = {https://CRAN.R-project.org/package=dplyr},
## doi = {10.32614/CRAN.package.dplyr},
## }
#Print the citation entry for the readr package
citation(package = "readr")
## To cite package 'readr' in publications use:
##
## Wickham H, Hester J, Bryan J (2024). _readr: Read Rectangular Text
## Data_. doi:10.32614/CRAN.package.readr
## <https://doi.org/10.32614/CRAN.package.readr>, R package version
## 2.1.5, <https://CRAN.R-project.org/package=readr>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {readr: Read Rectangular Text Data},
## author = {Hadley Wickham and Jim Hester and Jennifer Bryan},
## year = {2024},
## note = {R package version 2.1.5},
## url = {https://CRAN.R-project.org/package=readr},
## doi = {10.32614/CRAN.package.readr},
## }
#Print the citation entry for the boot package
citation(package = "boot")
## To cite the 'boot' package in publications use:
##
## Angelo Canty and Brian Ripley (2024). boot: Bootstrap R (S-Plus)
## Functions. R package version 1.3-31.
##
## Davison, A. C. & Hinkley, D. V. (1997) Bootstrap Methods and Their
## Applications. Cambridge University Press, Cambridge. ISBN
## 0-521-57391-2
##
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
#Print the citation entry for the ggplot2 package
citation(package = "ggplot2")
## To cite ggplot2 in publications, please use
##
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
## Springer-Verlag New York, 2016.
##
## A BibTeX entry for LaTeX users is
##
## @Book{,
## author = {Hadley Wickham},
## title = {ggplot2: Elegant Graphics for Data Analysis},
## publisher = {Springer-Verlag New York},
## year = {2016},
## isbn = {978-3-319-24277-4},
## url = {https://ggplot2.tidyverse.org},
## }
#Print the citation entry for the moments package
citation(package = "moments")
## To cite package 'moments' in publications use:
##
## Komsta L, Novomestky F (2022). _moments: Moments, Cumulants,
## Skewness, Kurtosis and Related Tests_.
## doi:10.32614/CRAN.package.moments
## <https://doi.org/10.32614/CRAN.package.moments>, R package version
## 0.14.1, <https://CRAN.R-project.org/package=moments>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {moments: Moments, Cumulants, Skewness, Kurtosis and Related Tests},
## author = {Lukasz Komsta and Frederick Novomestky},
## year = {2022},
## note = {R package version 0.14.1},
## url = {https://CRAN.R-project.org/package=moments},
## doi = {10.32614/CRAN.package.moments},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
#Print the citation entry for the scales package
citation(package = "scales")
## To cite package 'scales' in publications use:
##
## Wickham H, Pedersen T, Seidel D (2025). _scales: Scale Functions for
## Visualization_. doi:10.32614/CRAN.package.scales
## <https://doi.org/10.32614/CRAN.package.scales>, R package version
## 1.4.0, <https://CRAN.R-project.org/package=scales>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {scales: Scale Functions for Visualization},
## author = {Hadley Wickham and Thomas Lin Pedersen and Dana Seidel},
## year = {2025},
## note = {R package version 1.4.0},
## url = {https://CRAN.R-project.org/package=scales},
## doi = {10.32614/CRAN.package.scales},
## }
#Print the citation entry for the kableExtra package
citation(package = "kableExtra")
## To cite package 'kableExtra' in publications use:
##
## Zhu H (2024). _kableExtra: Construct Complex Table with 'kable' and
## Pipe Syntax_. doi:10.32614/CRAN.package.kableExtra
## <https://doi.org/10.32614/CRAN.package.kableExtra>, R package version
## 1.4.0, <https://CRAN.R-project.org/package=kableExtra>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {kableExtra: Construct Complex Table with 'kable' and Pipe Syntax},
## author = {Hao Zhu},
## year = {2024},
## note = {R package version 1.4.0},
## url = {https://CRAN.R-project.org/package=kableExtra},
## doi = {10.32614/CRAN.package.kableExtra},
## }
#Print the citation entry for the rsample package
citation(package = "rsample")
## To cite package 'rsample' in publications use:
##
## Frick H, Chow F, Kuhn M, Mahoney M, Silge J, Wickham H (2025).
## _rsample: General Resampling Infrastructure_.
## doi:10.32614/CRAN.package.rsample
## <https://doi.org/10.32614/CRAN.package.rsample>, R package version
## 1.3.1, <https://CRAN.R-project.org/package=rsample>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {rsample: General Resampling Infrastructure},
## author = {Hannah Frick and Fanny Chow and Max Kuhn and Michael Mahoney and Julia Silge and Hadley Wickham},
## year = {2025},
## note = {R package version 1.3.1},
## url = {https://CRAN.R-project.org/package=rsample},
## doi = {10.32614/CRAN.package.rsample},
## }
#Print the citation entry for the tidymodels package
citation(package = "tidymodels")
## To cite package 'tidymodels' in publications use:
##
## Kuhn et al., (2020). Tidymodels: a collection of packages for
## modeling and machine learning using tidyverse principles.
## https://www.tidymodels.org
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.},
## author = {Max Kuhn and Hadley Wickham},
## url = {https://www.tidymodels.org},
## year = {2020},
## }
#Print the citation entry for the parsnip package
citation(package = "parsnip")
## To cite package 'parsnip' in publications use:
##
## Kuhn M, Vaughan D (2025). _parsnip: A Common API to Modeling and
## Analysis Functions_. doi:10.32614/CRAN.package.parsnip
## <https://doi.org/10.32614/CRAN.package.parsnip>, R package version
## 1.4.0, <https://CRAN.R-project.org/package=parsnip>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {parsnip: A Common API to Modeling and Analysis Functions},
## author = {Max Kuhn and Davis Vaughan},
## year = {2025},
## note = {R package version 1.4.0},
## url = {https://CRAN.R-project.org/package=parsnip},
## doi = {10.32614/CRAN.package.parsnip},
## }
#Print the citation entry for the caret package
citation(package = "caret")
## To cite caret in publications use:
##
## Kuhn, M. (2008). Building Predictive Models in R Using the caret
## Package. Journal of Statistical Software, 28(5), 1–26.
## https://doi.org/10.18637/jss.v028.i05
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Building Predictive Models in R Using the caret Package},
## volume = {28},
## url = {https://www.jstatsoft.org/index.php/jss/article/view/v028i05},
## doi = {10.18637/jss.v028.i05},
## number = {5},
## journal = {Journal of Statistical Software},
## author = {{Kuhn} and {Max}},
## year = {2008},
## pages = {1–26},
## }
#Print the citation entry for the yardstick package
citation(package = "yardstick")
## To cite package 'yardstick' in publications use:
##
## Kuhn M, Vaughan D, Hvitfeldt E (2025). _yardstick: Tidy
## Characterizations of Model Performance_.
## doi:10.32614/CRAN.package.yardstick
## <https://doi.org/10.32614/CRAN.package.yardstick>, R package version
## 1.3.2, <https://CRAN.R-project.org/package=yardstick>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {yardstick: Tidy Characterizations of Model Performance},
## author = {Max Kuhn and Davis Vaughan and Emil Hvitfeldt},
## year = {2025},
## note = {R package version 1.3.2},
## url = {https://CRAN.R-project.org/package=yardstick},
## doi = {10.32614/CRAN.package.yardstick},
## }