The purpose of this project is to determine optimal feature combinations for a multiple regression prediction of home sales prices.

The Iowa Homes dataset was used, obtained from Kaggle.com (andradaolteanu, 2019) and loaded into RStudio, an integrated development environment (IDE) that allows both code inspection and the inclusion of descriptions in the RMarkdown file accompanying this report. The data had previously been split into a train.csv file for the training set and a test.csv file for testing the developed models. For this project, only the train.csv file was used, and it was further split for cross-validation as discussed later in this report.

#Load packages
library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(boot)
library(ggplot2)
library(moments)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.1
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(rsample)
## Warning: package 'rsample' was built under R version 4.5.2
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.5.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.11     ✔ tailor       0.1.0 
## ✔ dials        1.4.2      ✔ tune         2.0.1 
## ✔ infer        1.1.0      ✔ workflows    1.3.0 
## ✔ modeldata    1.5.1      ✔ workflowsets 1.1.1 
## ✔ parsnip      1.4.0      ✔ yardstick    1.3.2 
## ✔ recipes      1.3.1
## Warning: package 'broom' was built under R version 4.5.2
## Warning: package 'dials' was built under R version 4.5.2
## Warning: package 'modeldata' was built under R version 4.5.2
## Warning: package 'parsnip' was built under R version 4.5.2
## Warning: package 'tailor' was built under R version 4.5.2
## Warning: package 'tune' was built under R version 4.5.2
## Warning: package 'workflows' was built under R version 4.5.2
## Warning: package 'workflowsets' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()        masks purrr::discard()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ recipes::fixed()         masks stringr::fixed()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ yardstick::spec()        masks readr::spec()
## ✖ recipes::step()          masks stats::step()
library(parsnip)
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:rsample':
## 
##     calibration
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(yardstick)
# Data source: andradaolteanu. (2019, August 18). Housing Prices Competition - Iowa Dataset. Kaggle.com; Kaggle. https://www.kaggle.com/code/andradaolteanu/housing-prices-competition-iowa-dataset?select=test.csv
getwd()
## [1] "C:/Users/benke/Downloads/New folder (3)"
# Read the training CSV relative to the working directory reported above.
# No setwd() call: a hard-coded, machine-specific path breaks on any other
# computer and changes global session state. Fail early with a clear message
# if the file is not where it is expected.
train_file <- "Iowa.house.train.csv"
stopifnot(
  "Iowa.house.train.csv not found in the working directory" =
    file.exists(train_file)
)
house2 <- read.csv(train_file)
# Preview the first six rows
head(house2)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000

The dataset consists of 81 columns and 1460 observations. Of the 81 variables, 43 were nominal (character data type) and 38 were numeric. Of the 38 numeric variables, three (OverallQual, OverallCond, and MoSold) were numerically coded ratings or categories and were reclassified as factors to ensure they were not mistakenly treated as quantitative data in the analysis, given their ordinal or categorical nature.

# Inspect the data: dimensions, column types and the leading values per column
house2 |> glimpse()
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
# Base-R structure listing of the same data frame
utils::str(object = house2)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Per-column summary statistics, used to identify further pre-processing
# needs such as possible outliers and missing values.
summary(object = house2)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Data inspection revealed a total of 6965 NAs in the dataset, with 100% of observations containing at least one NA. By variable, 19 columns (23% of all columns) contained NAs. Because every observation contained at least one NA, removing incomplete rows was not an option, so NAs were imputed during preprocessing: numeric NAs were replaced with the column mean, factor NAs with the mode, and character NAs with the literal label "NA".

# Total number of missing cells in the dataset
total_na_count <- sum(is.na(house2))
total_na_count
## [1] 6965
# Rows holding at least one NA, and their share of all rows
rows_with_nas <- sum(!complete.cases(house2))
Percent_row_NA <- percent(rows_with_nas / nrow(house2))
rows_with_nas
## [1] 1460
Percent_row_NA
## [1] "100%"
# Columns holding at least one NA, and their share of all columns
cols_with_nas <- sum(vapply(house2, anyNA, logical(1)))
Percent_col_NA <- percent(cols_with_nas / ncol(house2))
cols_with_nas
## [1] 19
Percent_col_NA
## [1] "23%"
# Ratings and the month sold are stored as numbers but act as categories, so
# they are recoded as factors. Each value is turned into text first, which
# means the factor levels sort as strings ("1", "10", "2", ...).
factor_columns <- c("OverallQual", "OverallCond", "MoSold")
house2[factor_columns] <- lapply(
  factor_columns,
  function(name) factor(as.character(house2[[name]]))
)
# Replace NAs column by column:
#   numeric   -> column mean when more than `min_obs_for_mean` values are
#                observed, otherwise linear interpolation over the row index
#   factor    -> mode (most frequent level; ties go to the first level)
#   character -> the literal string "NA", kept as its own category
# Columns of any other type are returned unchanged.
impute_column <- function(col, min_obs_for_mean = 10) {
  missing <- is.na(col)
  n_observed <- sum(!missing)

  if (is.numeric(col)) { # is.numeric() is already TRUE for integer columns
    if (n_observed > min_obs_for_mean) {
      col[missing] <- mean(col, na.rm = TRUE)
    } else if (any(missing)) {
      if (n_observed >= 2) {
        # Interpolate at the positions of the missing rows themselves (xout);
        # rule = 2 extends the nearest observed value to leading/trailing gaps.
        positions <- seq_along(col)
        col[missing] <- approx(
          positions[!missing], col[!missing],
          xout = positions[missing], rule = 2
        )[["y"]]
      } else if (n_observed == 1) {
        # A single observed value cannot be interpolated; reuse it.
        col[missing] <- col[!missing]
      } else {
        warning("Numeric column has no observed values; NAs left in place.",
                call. = FALSE)
      }
    }
  } else if (is.factor(col)) {
    level_counts <- table(col)
    if (length(level_counts) > 0) {
      col[missing] <- names(level_counts)[which.max(level_counts)]
    }
  } else if (is.character(col)) {
    col[missing] <- "NA"
  }
  col
}

# lapply() returns a plain list, so it is converted back to a data frame.
# as.data.frame() also makes column names syntactic (1stFlrSF -> X1stFlrSF).
house2 <- as.data.frame(lapply(house2, impute_column))

# Mean imputation shifts some summary statistics (e.g. the LotFrontage median
# now equals its mean, 70.05).

# Check the updated dataset and ensure no remaining NAs
summary(house2)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 60.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 70.05  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 79.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                                      
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual   OverallCond    YearBuilt     YearRemodAdd 
##  Length:1460        5      :397   5      :821   Min.   :1872   Min.   :1950  
##  Class :character   6      :374   6      :252   1st Qu.:1954   1st Qu.:1967  
##  Mode  :character   7      :319   7      :205   Median :1973   Median :1994  
##                     8      :168   8      : 72   Mean   :1971   Mean   :1985  
##                     4      :116   4      : 57   3rd Qu.:2000   3rd Qu.:2004  
##                     9      : 43   3      : 25   Max.   :2010   Max.   :2010  
##                     (Other): 43   (Other): 28                                
##   RoofStyle           RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   MasVnrType          MasVnrArea      ExterQual          ExterCond        
##  Length:1460        Min.   :   0.0   Length:1460        Length:1460       
##  Class :character   1st Qu.:   0.0   Class :character   Class :character  
##  Mode  :character   Median :   0.0   Mode  :character   Mode  :character  
##                     Mean   : 103.7                                        
##                     3rd Qu.: 164.2                                        
##                     Max.   :1600.0                                        
##                                                                           
##   Foundation          BsmtQual           BsmtCond         BsmtExposure      
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtFinType1         BsmtFinSF1     BsmtFinType2         BsmtFinSF2     
##  Length:1460        Min.   :   0.0   Length:1460        Min.   :   0.00  
##  Class :character   1st Qu.:   0.0   Class :character   1st Qu.:   0.00  
##  Mode  :character   Median : 383.5   Mode  :character   Median :   0.00  
##                     Mean   : 443.6                      Mean   :  46.55  
##                     3rd Qu.: 712.2                      3rd Qu.:   0.00  
##                     Max.   :5644.0                      Max.   :1474.00  
##                                                                          
##    BsmtUnfSF       TotalBsmtSF       Heating           HeatingQC        
##  Min.   :   0.0   Min.   :   0.0   Length:1460        Length:1460       
##  1st Qu.: 223.0   1st Qu.: 795.8   Class :character   Class :character  
##  Median : 477.5   Median : 991.5   Mode  :character   Mode  :character  
##  Mean   : 567.2   Mean   :1057.4                                        
##  3rd Qu.: 808.0   3rd Qu.:1298.2                                        
##  Max.   :2336.0   Max.   :6110.0                                        
##                                                                         
##   CentralAir         Electrical          X1stFlrSF      X2ndFlrSF   
##  Length:1460        Length:1460        Min.   : 334   Min.   :   0  
##  Class :character   Class :character   1st Qu.: 882   1st Qu.:   0  
##  Mode  :character   Mode  :character   Median :1087   Median :   0  
##                                        Mean   :1163   Mean   : 347  
##                                        3rd Qu.:1391   3rd Qu.: 728  
##                                        Max.   :4692   Max.   :2065  
##                                                                     
##   LowQualFinSF       GrLivArea     BsmtFullBath     BsmtHalfBath    
##  Min.   :  0.000   Min.   : 334   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.000   Median :1464   Median :0.0000   Median :0.00000  
##  Mean   :  5.845   Mean   :1515   Mean   :0.4253   Mean   :0.05753  
##  3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :572.000   Max.   :5642   Max.   :3.0000   Max.   :2.00000  
##                                                                     
##     FullBath        HalfBath       BedroomAbvGr    KitchenAbvGr  
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :2.000   Median :0.0000   Median :3.000   Median :1.000  
##  Mean   :1.565   Mean   :0.3829   Mean   :2.866   Mean   :1.047  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000  
##  Max.   :3.000   Max.   :2.0000   Max.   :8.000   Max.   :3.000  
##                                                                  
##  KitchenQual         TotRmsAbvGrd     Functional          Fireplaces   
##  Length:1460        Min.   : 2.000   Length:1460        Min.   :0.000  
##  Class :character   1st Qu.: 5.000   Class :character   1st Qu.:0.000  
##  Mode  :character   Median : 6.000   Mode  :character   Median :1.000  
##                     Mean   : 6.518                      Mean   :0.613  
##                     3rd Qu.: 7.000                      3rd Qu.:1.000  
##                     Max.   :14.000                      Max.   :3.000  
##                                                                        
##  FireplaceQu         GarageType         GarageYrBlt   GarageFinish      
##  Length:1460        Length:1460        Min.   :1900   Length:1460       
##  Class :character   Class :character   1st Qu.:1962   Class :character  
##  Mode  :character   Mode  :character   Median :1979   Mode  :character  
##                                        Mean   :1979                     
##                                        3rd Qu.:2001                     
##                                        Max.   :2010                     
##                                                                         
##    GarageCars      GarageArea      GarageQual         GarageCond       
##  Min.   :0.000   Min.   :   0.0   Length:1460        Length:1460       
##  1st Qu.:1.000   1st Qu.: 334.5   Class :character   Class :character  
##  Median :2.000   Median : 480.0   Mode  :character   Mode  :character  
##  Mean   :1.767   Mean   : 473.0                                        
##  3rd Qu.:2.000   3rd Qu.: 576.0                                        
##  Max.   :4.000   Max.   :1418.0                                        
##                                                                        
##   PavedDrive          WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Length:1460        Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Class :character   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Median :  0.00   Median : 25.00   Median :  0.00  
##                     Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##                     3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##                     Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                       
##    X3SsnPorch      ScreenPorch        PoolArea          PoolQC         
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Length:1460       
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Class :character  
##  Median :  0.00   Median :  0.00   Median :  0.000   Mode  :character  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759                     
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000                     
##  Max.   :508.00   Max.   :480.00   Max.   :738.000                     
##                                                                        
##     Fence           MiscFeature           MiscVal             MoSold   
##  Length:1460        Length:1460        Min.   :    0.00   6      :253  
##  Class :character   Class :character   1st Qu.:    0.00   7      :234  
##  Mode  :character   Mode  :character   Median :    0.00   5      :204  
##                                        Mean   :   43.49   4      :141  
##                                        3rd Qu.:    0.00   8      :122  
##                                        Max.   :15500.00   3      :106  
##                                                           (Other):400  
##      YrSold       SaleType         SaleCondition        SalePrice     
##  Min.   :2006   Length:1460        Length:1460        Min.   : 34900  
##  1st Qu.:2007   Class :character   Class :character   1st Qu.:129975  
##  Median :2008   Mode  :character   Mode  :character   Median :163000  
##  Mean   :2008                                         Mean   :180921  
##  3rd Qu.:2009                                         3rd Qu.:214000  
##  Max.   :2010                                         Max.   :755000  
## 

The presence of outliers was also investigated and managed in the pre-processing step. Box plot visualization was used for four variables (lot area (LotArea), total basement square footage (TotalBsmtSF), basement square footage (BsmtFinSF1) and general living area (GrLivArea)) previously reported as important features for the prediction of home sale prices (Sharma et al., 2024). The box plots indicated that outlying data points may affect model performance. To eliminate outliers for model training, observations with a general living area below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR (where Q1 and Q3 are the first and third quartiles and IQR is the interquartile range) were removed from the dataset (Zach, 2020). General living area was chosen as the feature for eliminating outliers because filtering on it is likely to omit very large or very small houses, which may negatively affect model training. Once the outliers were removed, the dataset was split into training and testing datasets.

# Distribution plot and box plot for variables whose maximum lies far above the
# third quartile, to inspect them for outliers.

# Bar graph (factor column) or histogram (numeric column) of `col` in `data`.
# `col` is a column name given as a string. Returns NULL for other column types.
plot_distribution <- function(data, col) {
  values <- data[[col]]
  if (is.factor(values)) {
    ggplot(data, aes(x = .data[[col]], fill = .data[[col]])) +
      geom_bar() +
      labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
      theme_minimal() +
      theme(legend.position = "right")
  } else if (is.numeric(values)) {
    # A fixed bin count adapts to each variable's range; a fixed narrow
    # binwidth would create one near-empty bin per distinct value.
    ggplot(data, aes(x = .data[[col]])) +
      geom_histogram(bins = 30) +
      labs(title = paste("Histogram for", col), x = col, y = "Count") +
      theme_minimal()
  } else {
    NULL
  }
}

# Single box plot of the column `col` in `data`, with outliers drawn in red.
plot_box <- function(data, col) {
  ggplot(data, aes(x = "", y = .data[[col]])) +
    geom_boxplot(
      fill = "skyblue", color = "darkblue", width = .25,
      outlier.color = "red", outlier.size = 2
    ) +
    labs(
      title = paste("Box Plot for", col),
      x = NULL,
      y = "Value"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
      axis.title.y = element_text(size = 14),
      axis.text.y = element_text(size = 12)
    )
}

outlier_check_cols <- c("LotArea", "BsmtFinSF1", "TotalBsmtSF", "GrLivArea")

# ggplot objects are not auto-printed inside a loop, so print() is explicit.
for (col in outlier_check_cols) {
  distribution_plot <- plot_distribution(house2, col)
  if (!is.null(distribution_plot)) {
    print(distribution_plot)
  }
  print(plot_box(house2, col))
}

# Identify and remove outliers with the 1.5 * IQR rule.
# Zach. (2020, August 6). How to Remove Outliers in R. Statology. https://www.statology.org/remove-outliers-r/
gr_liv_area <- house2$GrLivArea
Q1 <- quantile(gr_liv_area, probs = 0.25)
Q3 <- quantile(gr_liv_area, probs = 0.75)
IQR <- IQR(gr_liv_area) # NOTE: this variable shares its name with the IQR() function

# Keep only rows strictly between the fences Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
lower_fence <- Q1 - 1.5 * IQR
upper_fence <- Q3 + 1.5 * IQR
inside_fences <- gr_liv_area > lower_fence & gr_liv_area < upper_fence
house2_no_out <- house2[which(inside_fences), , drop = FALSE]

# Dimensions of the filtered data show how many rows were dropped
dim(house2_no_out)
## [1] 1429   81
# Re-plot GrLivArea from the outlier-free data to compare with the plots above
col <- "GrLivArea"

if (is.factor(house2_no_out[[col]])) {
  # Bar graph for factor columns
  ggplot(house2_no_out, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(house2_no_out[[col]])) {
  # Histogram for numeric columns. The type check reads the same data frame
  # that is plotted, and a fixed bin count replaces the very narrow binwidth.
  ggplot(house2_no_out, aes(x = .data[[col]])) +
    geom_histogram(bins = 30) +
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}

# Box plot of the same column, with any remaining outliers drawn in red
ggplot(house2_no_out, aes(x = "", y = .data[[col]])) +
  geom_boxplot(
    fill = "skyblue", color = "darkblue", width = .25,
    outlier.color = "red", outlier.size = 2
  ) +
  labs(
    title = paste("Box Plot for", col),
    x = NULL,
    y = "Value"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title.y = element_text(size = 14),
    axis.text.y = element_text(size = 12)
  )

# Column types and first values of the outlier-free dataset
str(house2_no_out)
## 'data.frame':    1429 obs. of  81 variables:
##  $ Id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : num  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num  65 80 68 60 84 ...
##  $ LotArea      : num  8450 9600 11250 9550 14260 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  "NA" "NA" "NA" "NA" ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : Factor w/ 10 levels "1","10","2","3",..: 8 7 8 8 9 6 9 8 8 6 ...
##  $ OverallCond  : Factor w/ 9 levels "1","2","3","4",..: 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : num  2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num  2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num  706 978 486 216 655 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : num  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : num  856 1262 920 756 1145 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : num  856 1262 920 961 1145 ...
##  $ X2ndFlrSF    : num  854 0 866 756 1053 ...
##  $ LowQualFinSF : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num  1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : num  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : num  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : num  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  "NA" "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num  2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : num  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : num  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: num  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : num  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  "NA" "NA" "NA" "NA" ...
##  $ Fence        : chr  "NA" "NA" "NA" "NA" ...
##  $ MiscFeature  : chr  "NA" "NA" "NA" "NA" ...
##  $ MiscVal      : num  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : Factor w/ 12 levels "1","10","11",..: 5 8 12 5 4 2 11 3 7 1 ...
##  $ YrSold       : num  2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num  208500 181500 223500 140000 250000 ...

Following pre-processing, the dataset was split into a training and a testing dataset with 80% training data and 20% testing data. To identify relationships between variables for use in the multiple regression model, a subset of numeric variables, including variables reported by Sharma et al. (2024) as important features, was combined in a data frame for calculation of a correlation table.

# Split the outlier-free data once: 80% for training, 20% held out for testing.
# The seed is fixed before the random split is drawn.
set.seed(123)
house2_no_out_split <- rsample::initial_split(house2_no_out, prop = 0.8)

train_house <- rsample::training(house2_no_out_split)
test_house <- rsample::testing(house2_no_out_split)

# Preview the first rows of the training set
head(train_house, n = 6L)
##    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 422         20       RL    70.04996   16635   Pave    NA      IR1         Lvl
## 2 470         60       RL    76.00000    9291   Pave    NA      IR1         Lvl
## 3 181        160       FV    70.04996    2117   Pave    NA      Reg         Lvl
## 4 535         60       RL    74.00000    9056   Pave    NA      IR1         Lvl
## 5 199         75       RM    92.00000    5520   Pave    NA      Reg         Lvl
## 6 954         60       RL    70.04996   11075   Pave    NA      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub       FR2       Gtl       NWAmes       Norm       Norm     1Fam
## 2    AllPub    Corner       Gtl      SawyerW       RRNe       Norm     1Fam
## 3    AllPub    Inside       Gtl      Somerst       Norm       Norm    Twnhs
## 4    AllPub    Inside       Gtl      Gilbert       Norm       Norm     1Fam
## 5    AllPub    Corner       Gtl      OldTown       Norm       Norm     1Fam
## 6    AllPub    Inside       Mod      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     1Story           6           7      1977         2000     Gable  CompShg
## 2     2Story           6           5      1993         1993     Gable  CompShg
## 3     2Story           6           5      2000         2000     Gable  CompShg
## 4     2Story           8           5      2004         2004     Gable  CompShg
## 5     2.5Fin           6           6      1912         1950     Gable  CompShg
## 6     2Story           5           4      1969         1969     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     CemntBd     CmentBd      Stone        126        Gd        TA     CBlock
## 2     HdBoard     HdBoard    BrkFace        120        Gd        TA      PConc
## 3     MetalSd     MetalSd    BrkFace        456        Gd        TA      PConc
## 4     VinylSd     VinylSd       None          0        Gd        TA      PConc
## 5     Wd Sdng     Wd Sdng       None          0        TA        TA     CBlock
## 6     HdBoard     HdBoard    BrkFace        232        TA        TA     CBlock
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          ALQ       1246          Unf
## 2       Gd       TA           No          GLQ        426          Unf
## 3       Gd       TA           No          GLQ        436          Unf
## 4       Ex       Gd           Av          Unf          0          Unf
## 5       TA       TA           No          Unf          0          Unf
## 6       TA       TA           Av          ALQ        562          LwQ
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       356        1602    GasA        Gd          Y      SBrkr
## 2          0       406         832    GasA        Ex          Y      SBrkr
## 3          0       320         756    GasA        Ex          Y      SBrkr
## 4          0       707         707    GasA        Ex          Y      SBrkr
## 5          0       755         755    GasA        Ex          Y      SBrkr
## 6        193        29         784    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1      1602         0            0      1602            0            1        2
## 2       832       878            0      1710            0            0        2
## 3       769       756            0      1525            0            0        2
## 4       707       707            0      1414            0            0        2
## 5       929       929          371      2229            0            0        1
## 6      1168       800            0      1968            0            1        2
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        0            3            1          Gd            8        Typ
## 2        1            3            1          Gd            7        Typ
## 3        1            3            1          Gd            5        Typ
## 4        1            3            1          Gd            6        Typ
## 5        0            5            1          TA            8        Typ
## 6        1            4            1          TA            7       Min2
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          1          TA     Attchd    1977.000          Fin          2
## 2          0          NA     Attchd    1993.000          RFn          2
## 3          1          TA     Detchd    2000.000          Unf          2
## 4          1          Gd     Attchd    2004.000          Fin          2
## 5          0          NA         NA    1978.506           NA          0
## 6          1          Po     Attchd    1969.000          RFn          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        529         TA         TA          Y        240           0
## 2        506         TA         TA          Y        144          70
## 3        440         TA         TA          Y          0           0
## 4        403         TA         TA          Y        100          35
## 5          0         NA         NA          Y          0         198
## 6        530         TA         TA          Y        305         189
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0     NA    NA          NA
## 2             0          0           0        0     NA    NA          NA
## 3             0          0           0        0     NA    NA          NA
## 4             0          0           0        0     NA    NA          NA
## 5            30          0           0        0     NA MnPrv          NA
## 6             0          0           0        0     NA MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      6   2009       WD        Normal    215000
## 2       0      6   2008       WD        Normal    187000
## 3       0      6   2007       WD        Normal    177000
## 4       0     10   2006       WD        Normal    178000
## 5       0      7   2009       WD       Abnorml    104000
## 6     400      9   2008       WD        Normal    172000
# Keep only the numeric predictors of interest together with the response
# variable (SalePrice), then preview the first rows of the reduced data.
num_house_data_train <- train_house[
  ,
  c(
    "LotArea", "MasVnrArea", "BsmtFinSF1", "TotalBsmtSF",
    "GrLivArea", "GarageArea", "SalePrice"
  )
]
head(num_house_data_train)
##   LotArea MasVnrArea BsmtFinSF1 TotalBsmtSF GrLivArea GarageArea SalePrice
## 1   16635        126       1246        1602      1602        529    215000
## 2    9291        120        426         832      1710        506    187000
## 3    2117        456        436         756      1525        440    177000
## 4    9056          0          0         707      1414        403    178000
## 5    5520          0          0         755      2229          0    104000
## 6   11075        232        562         784      1968        530    172000

The correlation coefficients and the features determined in the literature as important (Sharma et al., 2024) were considered as options for inclusion in the development of the multiple regression model. Four models were created. The first model used only two predictors chosen with a moderate relationship between sale price (SalePrice) and variable value (garage area and general living area). The second model used variables with a weak relationship between sale price and variable value (finished basement square footage (BsmtFinSF1) and lot area (LotArea)) as a comparison. The third model was developed to investigate the impact of four variables, general living area, total basement square footage, finished basement square footage and lot area. Finally, the fourth model used three variables with a moderate relationship between sale price and value, general living area, total basement square footage, and garage area. Of the four models, the fourth model with three variables which demonstrate a moderate relationship between sale price and variable value demonstrated the highest performance.

# Pairwise correlation matrix of the selected numeric columns, rendered as a
# captioned table.
house_cor <- cor(num_house_data_train)
kable_classic(
  kbl(
    house_cor,
    caption = "Correlation Coefficients of Numeric Variables in Training Dataset"
  )
)
Correlation Coefficients of Numeric Variables in Training Dataset
LotArea MasVnrArea BsmtFinSF1 TotalBsmtSF GrLivArea GarageArea SalePrice
LotArea 1.0000000 0.0647277 0.1886994 0.2230464 0.2368129 0.2015887 0.2732216
MasVnrArea 0.0647277 1.0000000 0.2420855 0.3297760 0.3402613 0.3703872 0.4479425
BsmtFinSF1 0.1886994 0.2420855 1.0000000 0.4643250 0.1299623 0.3000188 0.4356013
TotalBsmtSF 0.2230464 0.3297760 0.4643250 1.0000000 0.3693650 0.4908337 0.6533336
GrLivArea 0.2368129 0.3402613 0.1299623 0.3693650 1.0000000 0.4544911 0.6935590
GarageArea 0.2015887 0.3703872 0.3000188 0.4908337 0.4544911 1.0000000 0.6501967
SalePrice 0.2732216 0.4479425 0.4356013 0.6533336 0.6935590 0.6501967 1.0000000
# Model 1: two numeric predictors (GarageArea and GrLivArea) that are among
# the most strongly correlated with SalePrice in the correlation table.
model1 <- lm(
  formula = SalePrice ~ GarageArea + GrLivArea,
  data = num_house_data_train
)
anova(model1)
## Analysis of Variance Table
## 
## Response: SalePrice
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## GarageArea    1 2.5605e+12 2.5605e+12 1276.49 < 2.2e-16 ***
## GrLivArea     1 1.2095e+12 1.2095e+12  602.97 < 2.2e-16 ***
## Residuals  1140 2.2867e+12 2.0059e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea, data = num_house_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -160486  -22420    -474   19953  310231 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11867.111   4715.304  -2.517    0.012 *  
## GarageArea     148.184      7.171  20.665   <2e-16 ***
## GrLivArea       81.126      3.304  24.555   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44790 on 1140 degrees of freedom
## Multiple R-squared:  0.6224, Adjusted R-squared:  0.6218 
## F-statistic: 939.7 on 2 and 1140 DF,  p-value: < 2.2e-16
# Model 2: two predictors with a weaker correlation to SalePrice
# (BsmtFinSF1 and LotArea), fitted as a comparison to inspect the impact on
# the F and p values.
model2 <- lm(
  formula = SalePrice ~ BsmtFinSF1 + LotArea,
  data = num_house_data_train
)
anova(model2)
## Analysis of Variance Table
## 
## Response: SalePrice
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## BsmtFinSF1    1 1.1493e+12 1.1493e+12 280.048 < 2.2e-16 ***
## LotArea       1 2.2917e+11 2.2917e+11  55.844 1.558e-13 ***
## Residuals  1140 4.6784e+12 4.1038e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + LotArea, data = num_house_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -290840  -44665  -13722   35159  310557 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.299e+05  3.474e+03  37.383  < 2e-16 ***
## BsmtFinSF1  6.730e+01  4.479e+00  15.024  < 2e-16 ***
## LotArea     1.858e+00  2.486e-01   7.473 1.56e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64060 on 1140 degrees of freedom
## Multiple R-squared:  0.2276, Adjusted R-squared:  0.2262 
## F-statistic: 167.9 on 2 and 1140 DF,  p-value: < 2.2e-16
# Model 3: four numeric predictors reported by Sharma et al. (2024) as
# important features, used to inspect how a four-variable model performs.
model3 <- lm(
  formula = SalePrice ~ GrLivArea + TotalBsmtSF + BsmtFinSF1 + LotArea,
  data = num_house_data_train
)
anova(model3)
## Analysis of Variance Table
## 
## Response: SalePrice
##               Df     Sum Sq    Mean Sq   F value  Pr(>F)    
## GrLivArea      1 2.9135e+12 2.9135e+12 1794.0267 < 2e-16 ***
## TotalBsmtSF    1 1.1063e+12 1.1063e+12  681.2235 < 2e-16 ***
## BsmtFinSF1     1 1.8359e+11 1.8359e+11  113.0472 < 2e-16 ***
## LotArea        1 5.3612e+09 5.3612e+09    3.3013 0.06949 .  
## Residuals   1138 1.8481e+12 1.6240e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + BsmtFinSF1 + 
##     LotArea, data = num_house_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -176355  -20187    2056   22238  218094 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.299e+04  4.477e+03  -7.370 3.28e-13 ***
## GrLivArea    8.529e+01  2.899e+00  29.424  < 2e-16 ***
## TotalBsmtSF  6.427e+01  3.521e+00  18.254  < 2e-16 ***
## BsmtFinSF1   3.264e+01  3.147e+00  10.371  < 2e-16 ***
## LotArea      2.922e-01  1.608e-01   1.817   0.0695 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40300 on 1138 degrees of freedom
## Multiple R-squared:  0.6949, Adjusted R-squared:  0.6938 
## F-statistic: 647.9 on 4 and 1138 DF,  p-value: < 2.2e-16
# Model 4: three numeric predictors reported by Sharma et al. (2024) as
# important features (GrLivArea, TotalBsmtSF, GarageArea). LotArea is left
# out because of its low correlation with SalePrice.
model4 <- lm(
  formula = SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea,
  data = num_house_data_train
)
anova(model4)
## Analysis of Variance Table
## 
## Response: SalePrice
##               Df     Sum Sq    Mean Sq F value    Pr(>F)    
## GrLivArea      1 2.9135e+12 2.9135e+12 1924.83 < 2.2e-16 ***
## TotalBsmtSF    1 1.1063e+12 1.1063e+12  730.89 < 2.2e-16 ***
## GarageArea     1 3.1302e+11 3.1302e+11  206.80 < 2.2e-16 ***
## Residuals   1139 1.7240e+12 1.5136e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model4)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea, 
##     data = num_house_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -163581  -18869     782   20619  256040 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -38239.016   4318.333  -8.855   <2e-16 ***
## GrLivArea       70.505      2.922  24.127   <2e-16 ***
## TotalBsmtSF     63.264      3.281  19.282   <2e-16 ***
## GarageArea      97.289      6.765  14.381   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38910 on 1139 degrees of freedom
## Multiple R-squared:  0.7154, Adjusted R-squared:  0.7146 
## F-statistic: 954.2 on 3 and 1139 DF,  p-value: < 2.2e-16

To validate the model and reduce the risk of over-fitting, 10-fold cross validation was performed on the training dataset. 10-fold cross validation was chosen as the preferred cross-validation method due to the low computational cost while continuing to re-sample data in 10 iterations for model training.

# Define a set of 10 cross-validation folds.
# Fold assignment is random, so the RNG seed is fixed first; without it the
# folds (and the cross-validated metrics computed from them) change every
# time the document is knit.
set.seed(123)
house_folds <- vfold_cv(num_house_data_train, v = 10)
house_folds
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1028/115]> Fold01
##  2 <split [1028/115]> Fold02
##  3 <split [1028/115]> Fold03
##  4 <split [1029/114]> Fold04
##  5 <split [1029/114]> Fold05
##  6 <split [1029/114]> Fold06
##  7 <split [1029/114]> Fold07
##  8 <split [1029/114]> Fold08
##  9 <split [1029/114]> Fold09
## 10 <split [1029/114]> Fold10
# Linear regression specification fitted with the "lm" engine.
multipleModel <- linear_reg(engine = "lm", mode = "regression")
# Workflow for model 4: the model 4 formula paired with the specification
# above.
house_workflow <- workflow() %>%
  add_formula(formula = SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea) %>%
  add_model(spec = multipleModel)
house_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Formula
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## SalePrice ~ GrLivArea + TotalBsmtSF + GarageArea
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
# Fit the model 4 workflow on every cross-validation fold and keep the
# per-fold results.
house_train_fit <- fit_resamples(house_workflow, resamples = house_folds)
house_train_fit
## # Resampling results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits             id     .metrics         .notes          
##    <list>             <chr>  <list>           <list>          
##  1 <split [1028/115]> Fold01 <tibble [2 × 4]> <tibble [0 × 4]>
##  2 <split [1028/115]> Fold02 <tibble [2 × 4]> <tibble [0 × 4]>
##  3 <split [1028/115]> Fold03 <tibble [2 × 4]> <tibble [0 × 4]>
##  4 <split [1029/114]> Fold04 <tibble [2 × 4]> <tibble [0 × 4]>
##  5 <split [1029/114]> Fold05 <tibble [2 × 4]> <tibble [0 × 4]>
##  6 <split [1029/114]> Fold06 <tibble [2 × 4]> <tibble [0 × 4]>
##  7 <split [1029/114]> Fold07 <tibble [2 × 4]> <tibble [0 × 4]>
##  8 <split [1029/114]> Fold08 <tibble [2 × 4]> <tibble [0 × 4]>
##  9 <split [1029/114]> Fold09 <tibble [2 × 4]> <tibble [0 × 4]>
## 10 <split [1029/114]> Fold10 <tibble [2 × 4]> <tibble [0 × 4]>
# Summarise the per-fold metrics into their mean across the folds
house_train_fit %>%
  collect_metrics()
## # A tibble: 2 × 6
##   .metric .estimator      mean     n   std_err .config        
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>          
## 1 rmse    standard   38789.       10 1553.     pre0_mod0_post0
## 2 rsq     standard       0.715    10    0.0197 pre0_mod0_post0
The purpose of this project was to compare various multiple regression models to determine the combination of variables with the strongest relationship with house sale price for future use in a prediction model. The approach taken in this project was to first identify variables previously reported in the literature as having a relationship with house prices. Once a list of variables was obtained (Sharma et al., 2024), the correlation coefficients were then calculated for this dataset to support identification of possible variables.  

The adjusted r-squared for the best performing model indicated that the variables of general living area, total basement square footage, and garage area accounted for 71.46% of the variance of the sale price. Use of these variables resulted in a model which performed moderately well, with an F-statistic of 954.2 and p < 0.001. Variables were chosen with a moderate correlation to the response variable of sale price but did not have high correlations with each other, indicating each variable included in the model contributed to the variance in sale prices and added predictive value. Because a small number of variables were chosen for use in the model, many non-linear relationships were not included in this project.

Forys (2022) compared a multiple regression model with neural networks to evaluate performance in house sale price predictions across model types. Forys (2022) explained that multivariate regression models have a high risk of overfitting because the addition of many variables leads to high variance. In comparing neural networks with multivariate regression models, Forys found that the ability of neural networks to analyze non-linear relationships increased model performance (Forys, 2022).

In this project, numerical variables supported feature selection by using correlation coefficients to identify variables of importance for use in multiple regression models. Models which incorporated variables with a weak relationship between house sale price and the value of the variable showed a lower adjusted r-squared and F-statistic, while continuing to show a p-value < 0.001. This reflected the importance of inspecting more than the p-value and r-squared value in determining the performance of a model.

The limitations of this project included the small number of variables used in the four models developed and the inclusion of solely numeric variables. Further inspection of the dataset with incorporation of important categorical variables, such as quality rating, may improve the performance of the model and increase the predictive power. To reduce computational cost, a small number of variables were selected limiting the robustness of the data used. Increased model performance may be found when using variables not included in the four models developed for this project. Additionally, the residual degrees of freedom are high compared to the model degrees of freedom. This is attributed to the simplicity of the model and the small number of variables used. Although this reduced the computational cost, the low number of variables may risk underfitting. Future work will incorporate categorical data values and increase the number of variables to improve model performance.

Overall, the model developed for this project performed moderately well when combining three valuable variables in a multiple regression model to predict the response variable of home sale price. This project demonstrates that a multiple regression model may be considered in the development of a model to predict the sale price of a house.

Works Cited

andradaolteanu. (2019, August 18). Housing Prices Competition - Iowa Dataset. Kaggle.com; Kaggle. https://www.kaggle.com/code/andradaolteanu/housing-prices-competition-iowa-dataset?select=test.csv

Forys, I. (2022). Machine learning in house price analysis: regression models versus neural networks. Procedia Computer Science, 207, 435–445. https://doi.org/10.1016/j.procs.2022.09.078

Sharma, H., Harsora, H., & Ogunleye, B. (2024). An Optimal House Price Prediction Algorithm: XGBoost. Analytics, 3(1), 30–45. https://doi.org/10.3390/analytics3010003

Sharma, S., Arora, D., Shankar, G., Sharma, P., & Motwani, V. (2023). House Price Prediction using Machine Learning Algorithm. Proceedings - 7th International Conference on Computing Methodologies and Communication, ICCMC 2023, 982– 986. https://doi.org/10.1109/ICCMC56507.2023.10084197

Zach. (2020, August 6). How to Remove Outliers in R. Statology. https://www.statology.org/remove-outliers-r/

citation("tidyverse")
## To cite package 'tidyverse' in publications use:
## 
##   Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
##   Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
##   E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
##   Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
##   the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
##   doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Welcome to the {tidyverse}},
##     author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
##     year = {2019},
##     journal = {Journal of Open Source Software},
##     volume = {4},
##     number = {43},
##     pages = {1686},
##     doi = {10.21105/joss.01686},
##   }
citation("dplyr")
## To cite package 'dplyr' in publications use:
## 
##   Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
##   Grammar of Data Manipulation_. doi:10.32614/CRAN.package.dplyr
##   <https://doi.org/10.32614/CRAN.package.dplyr>, R package version
##   1.1.4, <https://CRAN.R-project.org/package=dplyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
##     year = {2023},
##     note = {R package version 1.1.4},
##     url = {https://CRAN.R-project.org/package=dplyr},
##     doi = {10.32614/CRAN.package.dplyr},
##   }
citation("readr")
## To cite package 'readr' in publications use:
## 
##   Wickham H, Hester J, Bryan J (2024). _readr: Read Rectangular Text
##   Data_. doi:10.32614/CRAN.package.readr
##   <https://doi.org/10.32614/CRAN.package.readr>, R package version
##   2.1.5, <https://CRAN.R-project.org/package=readr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readr: Read Rectangular Text Data},
##     author = {Hadley Wickham and Jim Hester and Jennifer Bryan},
##     year = {2024},
##     note = {R package version 2.1.5},
##     url = {https://CRAN.R-project.org/package=readr},
##     doi = {10.32614/CRAN.package.readr},
##   }
citation("boot")
## To cite the 'boot' package in publications use:
## 
##   Angelo Canty and Brian Ripley (2024). boot: Bootstrap R (S-Plus)
##   Functions. R package version 1.3-31.
## 
##   Davison, A. C. & Hinkley, D. V. (1997) Bootstrap Methods and Their
##   Applications. Cambridge University Press, Cambridge. ISBN
##   0-521-57391-2
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
citation("ggplot2")
## To cite ggplot2 in publications, please use
## 
##   H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
##   Springer-Verlag New York, 2016.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     author = {Hadley Wickham},
##     title = {ggplot2: Elegant Graphics for Data Analysis},
##     publisher = {Springer-Verlag New York},
##     year = {2016},
##     isbn = {978-3-319-24277-4},
##     url = {https://ggplot2.tidyverse.org},
##   }
citation("moments")
## To cite package 'moments' in publications use:
## 
##   Komsta L, Novomestky F (2022). _moments: Moments, Cumulants,
##   Skewness, Kurtosis and Related Tests_.
##   doi:10.32614/CRAN.package.moments
##   <https://doi.org/10.32614/CRAN.package.moments>, R package version
##   0.14.1, <https://CRAN.R-project.org/package=moments>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {moments: Moments, Cumulants, Skewness, Kurtosis and Related Tests},
##     author = {Lukasz Komsta and Frederick Novomestky},
##     year = {2022},
##     note = {R package version 0.14.1},
##     url = {https://CRAN.R-project.org/package=moments},
##     doi = {10.32614/CRAN.package.moments},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
citation("scales")
## To cite package 'scales' in publications use:
## 
##   Wickham H, Pedersen T, Seidel D (2025). _scales: Scale Functions for
##   Visualization_. doi:10.32614/CRAN.package.scales
##   <https://doi.org/10.32614/CRAN.package.scales>, R package version
##   1.4.0, <https://CRAN.R-project.org/package=scales>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {scales: Scale Functions for Visualization},
##     author = {Hadley Wickham and Thomas Lin Pedersen and Dana Seidel},
##     year = {2025},
##     note = {R package version 1.4.0},
##     url = {https://CRAN.R-project.org/package=scales},
##     doi = {10.32614/CRAN.package.scales},
##   }
citation("kableExtra")
## To cite package 'kableExtra' in publications use:
## 
##   Zhu H (2024). _kableExtra: Construct Complex Table with 'kable' and
##   Pipe Syntax_. doi:10.32614/CRAN.package.kableExtra
##   <https://doi.org/10.32614/CRAN.package.kableExtra>, R package version
##   1.4.0, <https://CRAN.R-project.org/package=kableExtra>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {kableExtra: Construct Complex Table with 'kable' and Pipe Syntax},
##     author = {Hao Zhu},
##     year = {2024},
##     note = {R package version 1.4.0},
##     url = {https://CRAN.R-project.org/package=kableExtra},
##     doi = {10.32614/CRAN.package.kableExtra},
##   }
citation("rsample")
## To cite package 'rsample' in publications use:
## 
##   Frick H, Chow F, Kuhn M, Mahoney M, Silge J, Wickham H (2025).
##   _rsample: General Resampling Infrastructure_.
##   doi:10.32614/CRAN.package.rsample
##   <https://doi.org/10.32614/CRAN.package.rsample>, R package version
##   1.3.1, <https://CRAN.R-project.org/package=rsample>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {rsample: General Resampling Infrastructure},
##     author = {Hannah Frick and Fanny Chow and Max Kuhn and Michael Mahoney and Julia Silge and Hadley Wickham},
##     year = {2025},
##     note = {R package version 1.3.1},
##     url = {https://CRAN.R-project.org/package=rsample},
##     doi = {10.32614/CRAN.package.rsample},
##   }
citation("tidymodels")
## To cite package 'tidymodels' in publications use:
## 
##   Kuhn et al., (2020). Tidymodels: a collection of packages for
##   modeling and machine learning using tidyverse principles.
##   https://www.tidymodels.org
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.},
##     author = {Max Kuhn and Hadley Wickham},
##     url = {https://www.tidymodels.org},
##     year = {2020},
##   }
citation("parsnip")
## To cite package 'parsnip' in publications use:
## 
##   Kuhn M, Vaughan D (2025). _parsnip: A Common API to Modeling and
##   Analysis Functions_. doi:10.32614/CRAN.package.parsnip
##   <https://doi.org/10.32614/CRAN.package.parsnip>, R package version
##   1.4.0, <https://CRAN.R-project.org/package=parsnip>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {parsnip: A Common API to Modeling and Analysis Functions},
##     author = {Max Kuhn and Davis Vaughan},
##     year = {2025},
##     note = {R package version 1.4.0},
##     url = {https://CRAN.R-project.org/package=parsnip},
##     doi = {10.32614/CRAN.package.parsnip},
##   }
citation("caret")
## To cite caret in publications use:
## 
##   Kuhn, M. (2008). Building Predictive Models in R Using the caret
##   Package. Journal of Statistical Software, 28(5), 1–26.
##   https://doi.org/10.18637/jss.v028.i05
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Building Predictive Models in R Using the caret Package},
##     volume = {28},
##     url = {https://www.jstatsoft.org/index.php/jss/article/view/v028i05},
##     doi = {10.18637/jss.v028.i05},
##     number = {5},
##     journal = {Journal of Statistical Software},
##     author = {{Kuhn} and {Max}},
##     year = {2008},
##     pages = {1–26},
##   }
citation("yardstick")
## To cite package 'yardstick' in publications use:
## 
##   Kuhn M, Vaughan D, Hvitfeldt E (2025). _yardstick: Tidy
##   Characterizations of Model Performance_.
##   doi:10.32614/CRAN.package.yardstick
##   <https://doi.org/10.32614/CRAN.package.yardstick>, R package version
##   1.3.2, <https://CRAN.R-project.org/package=yardstick>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {yardstick: Tidy Characterizations of Model Performance},
##     author = {Max Kuhn and Davis Vaughan and Emil Hvitfeldt},
##     year = {2025},
##     note = {R package version 1.3.2},
##     url = {https://CRAN.R-project.org/package=yardstick},
##     doi = {10.32614/CRAN.package.yardstick},
##   }