Loading necessary Libraries:

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Loading and reading the data in R:

The purpose of this project is to use different techniques in either R or Python programming language to analyse the relationship between the Sale Price of the houses in Ames, Iowa and different explanatory variables. Then, using the results of the analysis, predict the prices of the houses.

Let’s get started; first, we’ll load the data into R:

# Load the training set from its raw GitHub URL into Train_DataFrame
Train_DataFrame <- read.csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data605/main/train.csv"
)

# Load the test set from the same repository into Test_DataFrame
Test_DataFrame <- read.csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data605/main/test.csv"
)

Exploring the Data:

Let’s look at the data and its structure and see if we need to perform any tidying:

# Preview the first six rows of the training data
head(Train_DataFrame, n = 6L)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns
summary(object = Train_DataFrame)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Based on the summary statistics of the SalePrice variable, its distribution appears to be right-skewed, since the mean (180,921) is greater than the median (163,000). We can see this more clearly in the following plots:

# Index plot: each sale price is drawn against its row position in the data
plot(Train_DataFrame$SalePrice)

# Histogram of SalePrice; the bin count is left at the ggplot2 default
ggplot(Train_DataFrame, aes(SalePrice)) +
  geom_histogram(fill = "blue", color = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The majority of properties seem to have lower sale prices, as indicated by the clustering of points along the lower half of the graph. The histogram shows that the SalePrice is right skewed.

# Compact overview of every column: its type and first few values
str(object = Train_DataFrame)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

The data contain a mix of numerical and categorical variables; we may need to create subsets that separate the two types for further analysis.

Tidying the data:

Train Data:

First, let’s identify any missing values:

# Count the NAs in every column of the training data
missing_values <- colSums(is.na(Train_DataFrame))

# Names of the columns that contain at least one NA
missing_variables <- names(which(missing_values > 0))

# One row per affected column; the named count vector supplies the row names
missing_info <- data.frame(
  Variable = missing_variables,
  Missing_Count = missing_values[missing_variables]
)

# Sort so the columns with the most NAs come first (ties keep column order)
missing_info <- missing_info[
  order(missing_info$Missing_Count, decreasing = TRUE),
]

# Show the sorted table
print(missing_info)
##                  Variable Missing_Count
## PoolQC             PoolQC          1453
## MiscFeature   MiscFeature          1406
## Alley               Alley          1369
## Fence               Fence          1179
## FireplaceQu   FireplaceQu           690
## LotFrontage   LotFrontage           259
## GarageType     GarageType            81
## GarageYrBlt   GarageYrBlt            81
## GarageFinish GarageFinish            81
## GarageQual     GarageQual            81
## GarageCond     GarageCond            81
## BsmtExposure BsmtExposure            38
## BsmtFinType2 BsmtFinType2            38
## BsmtQual         BsmtQual            37
## BsmtCond         BsmtCond            37
## BsmtFinType1 BsmtFinType1            37
## MasVnrType     MasVnrType             8
## MasVnrArea     MasVnrArea             8
## Electrical     Electrical             1

Let’s visualize the missing values to see which variables have the most:

# Bar chart of the NA counts, one bar per variable;
# las = 2 turns the axis labels perpendicular to the axis
barplot(
  height = missing_info$Missing_Count,
  names.arg = missing_info$Variable,
  main = "Count of Missing Data by Variable",
  xlab = "Variable",
  ylab = "Missing Count",
  col = "skyblue",
  las = 2,
  cex.names = 0.8
)

After calculating the percentage of missing values for each variable, we can confidently remove the variables with the highest share of missing values (more than 80%), since they will contribute little to further analysis.

# Number of rows in the training data (denominator for the percentages)
numb_obs <- nrow(Train_DataFrame)

# Add the share of missing values per variable, in percent
missing_info <- missing_info %>%
  mutate(missing_percentage = (Missing_Count / numb_obs) * 100)

# Variables whose share of missing values exceeds 80%
vars_to_remove <- missing_info %>%
  filter(missing_percentage > 80) %>%
  pull(Variable)

# Drop those variables. all_of() replaces the superseded one_of() and raises
# an error if a listed column is absent, instead of only warning.
Train_df_subset <- Train_DataFrame %>%
  select(-all_of(vars_to_remove))

Test Data:

Now let’s do the same for the test data: count the missing values for each variable and remove the variables that are mostly missing from the data set.

# Count the NAs in every column of the test data
missing_valuesTest <- colSums(is.na(Test_DataFrame))

# Names of the columns that contain at least one NA
missing_variablesTest <- names(which(missing_valuesTest > 0))

# One row per affected column; the named count vector supplies the row names
missing_infoTest <- data.frame(
  Variable = missing_variablesTest,
  Missing_Count = missing_valuesTest[missing_variablesTest]
)

# Sort so the columns with the most NAs come first (ties keep column order)
missing_infoTest <- missing_infoTest[
  order(missing_infoTest$Missing_Count, decreasing = TRUE),
]

# Show the sorted table
print(missing_infoTest)
##                  Variable Missing_Count
## PoolQC             PoolQC          1456
## MiscFeature   MiscFeature          1408
## Alley               Alley          1352
## Fence               Fence          1169
## FireplaceQu   FireplaceQu           730
## LotFrontage   LotFrontage           227
## GarageYrBlt   GarageYrBlt            78
## GarageFinish GarageFinish            78
## GarageQual     GarageQual            78
## GarageCond     GarageCond            78
## GarageType     GarageType            76
## BsmtCond         BsmtCond            45
## BsmtQual         BsmtQual            44
## BsmtExposure BsmtExposure            44
## BsmtFinType1 BsmtFinType1            42
## BsmtFinType2 BsmtFinType2            42
## MasVnrType     MasVnrType            16
## MasVnrArea     MasVnrArea            15
## MSZoning         MSZoning             4
## Utilities       Utilities             2
## BsmtFullBath BsmtFullBath             2
## BsmtHalfBath BsmtHalfBath             2
## Functional     Functional             2
## Exterior1st   Exterior1st             1
## Exterior2nd   Exterior2nd             1
## BsmtFinSF1     BsmtFinSF1             1
## BsmtFinSF2     BsmtFinSF2             1
## BsmtUnfSF       BsmtUnfSF             1
## TotalBsmtSF   TotalBsmtSF             1
## KitchenQual   KitchenQual             1
## GarageCars     GarageCars             1
## GarageArea     GarageArea             1
## SaleType         SaleType             1

Let’s visualize the missing values to see which variables have the most:

# Bar chart of the test-data NA counts, one bar per variable;
# las = 2 turns the axis labels perpendicular to the axis
barplot(
  height = missing_infoTest$Missing_Count,
  names.arg = missing_infoTest$Variable,
  main = "Count of Missing Data by Variable",
  xlab = "Variable",
  ylab = "Missing Count",
  col = "skyblue",
  las = 2,
  cex.names = 0.8
)

# Number of rows in the test data (denominator for the percentages)
numb_obsTest <- nrow(Test_DataFrame)

# Add the share of missing values per variable, in percent
missing_infoTest <- missing_infoTest %>%
  mutate(missing_percentage = (Missing_Count / numb_obsTest) * 100)

# Variables whose share of missing values exceeds 80%
vars_to_removeTest <- missing_infoTest %>%
  filter(missing_percentage > 80) %>%
  pull(Variable)

# Drop those variables. all_of() replaces the superseded one_of() and raises
# an error if a listed column is absent, instead of only warning.
# NOTE(review): the drop list is derived from the test data itself, so it is
# not guaranteed to match the columns dropped from the training data.
Test_df_subset <- Test_DataFrame %>%
  select(-all_of(vars_to_removeTest))

Handling the missing data:

# Impute the training data with MICE using predictive mean matching ("pmm"),
# producing m = 5 imputed data sets. The seed fixes the random draws.
# NOTE(review): the iteration log lists only LotFrontage, MasVnrArea and
# GarageYrBlt, and mice reports 189 logged events; inspect
# mice_mod$loggedEvents to see what was skipped.
set.seed(124)
mice_mod <- mice(data = Train_df_subset, m = 5, method = "pmm")
## 
##  iter imp variable
##   1   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   1   2  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   1   3  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   1   4  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   1   5  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   2  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   3  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   4  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   5  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   2  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   3  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   4  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   5  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   2  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   3  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   4  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   5  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   2  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   3  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   4  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   5  LotFrontage*  MasVnrArea*  GarageYrBlt*
## Warning: Number of logged events: 189
# Extract a completed training data set from the imputation model
# (action = 1L, the default, selects the first of the m imputations)
imputed_data <- complete(mice_mod, action = 1L)

# Build the MICE imputation model for the test subset, again using
# predictive mean matching with m = 5 imputations
mice_modTest <- mice(data = Test_df_subset, m = 5, method = "pmm")
## 
##  iter imp variable
##   1   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   1   2  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   1   3  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   1   4  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   1   5  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   2  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   3  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   4  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   5  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   2  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   3  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   4  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   5  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   2  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   3  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   4  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   5  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   2  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   3  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   4  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   5  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
## Warning: Number of logged events: 589
# Extract a completed test data set from the imputation model
# (action = 1L, the default, selects the first of the m imputations)
imputed_dataTest <- complete(mice_modTest, action = 1L)

After imputing both data sets (train and test), let’s assess the quality of the imputation:

# Diagnostic plots for the training-set imputation model
# (presumably the trace plots from mice's plot method for checking
# convergence -- confirm against the mice documentation)
plot(mice_mod)

# Summary of the training-set imputation model: number of imputations,
# imputation method per variable, predictor matrix and logged events
summary(mice_mod)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##            ""            ""            ""         "pmm"            "" 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##            ""            ""            ""            ""            "" 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##            ""            ""            ""            ""            "" 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##            ""            ""            ""            ""            "" 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##            ""            ""            ""            ""            "" 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##         "pmm"            ""            ""            ""            "" 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##            ""            ""            ""            ""            "" 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##            ""            ""            ""            ""            "" 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##            ""            ""            ""            ""            "" 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##            ""            ""            ""            ""            "" 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##            ""            ""            ""            ""            "" 
##    Fireplaces   FireplaceQu    GarageType   GarageYrBlt  GarageFinish 
##            ""            ""            ""         "pmm"            "" 
##    GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive 
##            ""            ""            ""            ""            "" 
##    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch 
##            ""            ""            ""            ""            "" 
##      PoolArea       MiscVal        MoSold        YrSold      SaleType 
##            ""            ""            ""            ""            "" 
## SaleCondition     SalePrice 
##            ""            "" 
## PredictorMatrix:
##             Id MSSubClass MSZoning LotFrontage LotArea Street LotShape
## Id           0          1        0           1       1      0        0
## MSSubClass   1          0        0           1       1      0        0
## MSZoning     1          1        0           1       1      0        0
## LotFrontage  1          1        0           0       1      0        0
## LotArea      1          1        0           1       0      0        0
## Street       1          1        0           1       1      0        0
##             LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## Id                    0         0         0         0            0          0
## MSSubClass            0         0         0         0            0          0
## MSZoning              0         0         0         0            0          0
## LotFrontage           0         0         0         0            0          0
## LotArea               0         0         0         0            0          0
## Street                0         0         0         0            0          0
##             Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## Id                   0        0          0           1           1         1
## MSSubClass           0        0          0           1           1         1
## MSZoning             0        0          0           1           1         1
## LotFrontage          0        0          0           1           1         1
## LotArea              0        0          0           1           1         1
## Street               0        0          0           1           1         1
##             YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## Id                     1         0        0           0           0          0
## MSSubClass             1         0        0           0           0          0
## MSZoning               1         0        0           0           0          0
## LotFrontage            1         0        0           0           0          0
## LotArea                1         0        0           0           0          0
## Street                 1         0        0           0           0          0
##             MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## Id                   1         0         0          0        0        0
## MSSubClass           1         0         0          0        0        0
## MSZoning             1         0         0          0        0        0
## LotFrontage          1         0         0          0        0        0
## LotArea              1         0         0          0        0        0
## Street               1         0         0          0        0        0
##             BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Id                     0            0          1            0          1
## MSSubClass             0            0          1            0          1
## MSZoning               0            0          1            0          1
## LotFrontage            0            0          1            0          1
## LotArea                0            0          1            0          1
## Street                 0            0          1            0          1
##             BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Id                  1           1       0         0          0          0
## MSSubClass          1           1       0         0          0          0
## MSZoning            1           1       0         0          0          0
## LotFrontage         1           1       0         0          0          0
## LotArea             1           1       0         0          0          0
## Street              1           1       0         0          0          0
##             X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Id                  1         1            1         1            1
## MSSubClass          1         1            1         1            1
## MSZoning            1         1            1         1            1
## LotFrontage         1         1            1         1            1
## LotArea             1         1            1         1            1
## Street              1         1            1         1            1
##             BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Id                     1        1        1            1            1
## MSSubClass             1        1        1            1            1
## MSZoning               1        1        1            1            1
## LotFrontage            1        1        1            1            1
## LotArea                1        1        1            1            1
## Street                 1        1        1            1            1
##             KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Id                    0            1          0          1           0
## MSSubClass            0            1          0          1           0
## MSZoning              0            1          0          1           0
## LotFrontage           0            1          0          1           0
## LotArea               0            1          0          1           0
## Street                0            1          0          1           0
##             GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## Id                   0           1            0          1          1
## MSSubClass           0           1            0          1          1
## MSZoning             0           1            0          1          1
## LotFrontage          0           1            0          1          1
## LotArea              0           1            0          1          1
## Street               0           1            0          1          1
##             GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Id                   0          0          0          1           1
## MSSubClass           0          0          0          1           1
## MSZoning             0          0          0          1           1
## LotFrontage          0          0          0          1           1
## LotArea              0          0          0          1           1
## Street               0          0          0          1           1
##             EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
## Id                      1          1           1        1       1      1      1
## MSSubClass              1          1           1        1       1      1      1
## MSZoning                1          1           1        1       1      1      1
## LotFrontage             1          1           1        1       1      1      1
## LotArea                 1          1           1        1       1      1      1
## Street                  1          1           1        1       1      1      1
##             SaleType SaleCondition SalePrice
## Id                 0             0         1
## MSSubClass         0             0         1
## MSZoning           0             0         1
## LotFrontage        0             0         1
## LotArea            0             0         1
## Street             0             0         1
## Number of logged events:  189 
##   it im dep     meth         out
## 1  0  0     constant    MSZoning
## 2  0  0     constant      Street
## 3  0  0     constant    LotShape
## 4  0  0     constant LandContour
## 5  0  0     constant   Utilities
## 6  0  0     constant   LotConfig
# Diagnostic plots for the test-set imputation model
# (presumably the trace plots from mice's plot method for checking
# convergence -- confirm against the mice documentation)
plot(mice_modTest)

# Summary of the test-set imputation model: number of imputations,
# imputation method per variable, predictor matrix and logged events
summary(mice_modTest)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##            ""            ""            ""         "pmm"            "" 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##            ""            ""            ""            ""            "" 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##            ""            ""            ""            ""            "" 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##            ""            ""            ""            ""            "" 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##            ""            ""            ""            ""            "" 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##         "pmm"            ""            ""            ""            "" 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##            ""            ""            ""         "pmm"            "" 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##         "pmm"         "pmm"         "pmm"            ""            "" 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##            ""            ""            ""            ""            "" 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##            ""         "pmm"         "pmm"            ""            "" 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##            ""            ""            ""            ""            "" 
##    Fireplaces   FireplaceQu    GarageType   GarageYrBlt  GarageFinish 
##            ""            ""            ""         "pmm"            "" 
##    GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive 
##         "pmm"         "pmm"            ""            ""            "" 
##    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch 
##            ""            ""            ""            ""            "" 
##      PoolArea       MiscVal        MoSold        YrSold      SaleType 
##            ""            ""            ""            ""            "" 
## SaleCondition 
##            "" 
## PredictorMatrix:
##             Id MSSubClass MSZoning LotFrontage LotArea Street LotShape
## Id           0          1        0           1       1      0        0
## MSSubClass   1          0        0           1       1      0        0
## MSZoning     0          0        0           0       0      0        0
## LotFrontage  1          1        0           0       1      0        0
## LotArea      1          1        0           1       0      0        0
## Street       1          1        0           1       1      0        0
##             LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## Id                    0         0         0         0            0          0
## MSSubClass            0         0         0         0            0          0
## MSZoning              0         0         0         0            0          0
## LotFrontage           0         0         0         0            0          0
## LotArea               0         0         0         0            0          0
## Street                0         0         0         0            0          0
##             Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## Id                   0        0          0           1           1         1
## MSSubClass           0        0          0           1           1         1
## MSZoning             0        0          0           0           0         0
## LotFrontage          0        0          0           1           1         1
## LotArea              0        0          0           1           1         1
## Street               0        0          0           1           1         1
##             YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## Id                     1         0        0           0           0          0
## MSSubClass             1         0        0           0           0          0
## MSZoning               0         0        0           0           0          0
## LotFrontage            1         0        0           0           0          0
## LotArea                1         0        0           0           0          0
## Street                 1         0        0           0           0          0
##             MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## Id                   1         0         0          0        0        0
## MSSubClass           1         0         0          0        0        0
## MSZoning             0         0         0          0        0        0
## LotFrontage          1         0         0          0        0        0
## LotArea              1         0         0          0        0        0
## Street               1         0         0          0        0        0
##             BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Id                     0            0          1            0          1
## MSSubClass             0            0          1            0          1
## MSZoning               0            0          0            0          0
## LotFrontage            0            0          1            0          1
## LotArea                0            0          1            0          1
## Street                 0            0          1            0          1
##             BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Id                  1           1       0         0          0          0
## MSSubClass          1           1       0         0          0          0
## MSZoning            0           0       0         0          0          0
## LotFrontage         1           1       0         0          0          0
## LotArea             1           1       0         0          0          0
## Street              1           1       0         0          0          0
##             X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Id                  1         1            1         1            1
## MSSubClass          1         1            1         1            1
## MSZoning            0         0            0         0            0
## LotFrontage         1         1            1         1            1
## LotArea             1         1            1         1            1
## Street              1         1            1         1            1
##             BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Id                     1        1        1            1            1
## MSSubClass             1        1        1            1            1
## MSZoning               0        0        0            0            0
## LotFrontage            1        1        1            1            1
## LotArea                1        1        1            1            1
## Street                 1        1        1            1            1
##             KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Id                    0            1          0          1           0
## MSSubClass            0            1          0          1           0
## MSZoning              0            0          0          0           0
## LotFrontage           0            1          0          1           0
## LotArea               0            1          0          1           0
## Street                0            1          0          1           0
##             GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## Id                   0           1            0          1          1
## MSSubClass           0           1            0          1          1
## MSZoning             0           0            0          0          0
## LotFrontage          0           1            0          1          1
## LotArea              0           1            0          1          1
## Street               0           1            0          1          1
##             GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Id                   0          0          0          1           1
## MSSubClass           0          0          0          1           1
## MSZoning             0          0          0          0           0
## LotFrontage          0          0          0          1           1
## LotArea              0          0          0          1           1
## Street               0          0          0          1           1
##             EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
## Id                      1          1           1        1       1      1      1
## MSSubClass              1          1           1        1       1      1      1
## MSZoning                0          0           0        0       0      0      0
## LotFrontage             1          1           1        1       1      1      1
## LotArea                 1          1           1        1       1      1      1
## Street                  1          1           1        1       1      1      1
##             SaleType SaleCondition
## Id                 0             0
## MSSubClass         0             0
## MSZoning           0             0
## LotFrontage        0             0
## LotArea            0             0
## Street             0             0
## Number of logged events:  589 
##   it im dep     meth         out
## 1  0  0     constant    MSZoning
## 2  0  0     constant      Street
## 3  0  0     constant    LotShape
## 4  0  0     constant LandContour
## 5  0  0     constant   Utilities
## 6  0  0     constant   LotConfig

Now let’s take a look at SalePrice in both data sets: Train_DataFrame (before tidying) and Train_df_subset (after tidying):

# Distribution of SalePrice in the original training data
hist(
  x = Train_DataFrame[["SalePrice"]],
  main = "Train Data Frame before tidying",
  xlab = "Sale Price"
)

# Distribution of SalePrice in the tidied training subset
hist(
  x = Train_df_subset[["SalePrice"]],
  main = "Train Subset after tidying",
  xlab = "Sale Price"
)

Before visualizing the numerical variables in the data, let’s check their skewness, then keep only the variables with strong positive skewness.

# Flag the numeric and the character (categorical) columns.
# vapply() always returns a logical vector with one entry per column,
# whereas the return type of sapply() depends on its input.
numerical_vars <- vapply(Train_df_subset, is.numeric, logical(1))
Categorical_vars <- vapply(Train_df_subset, is.character, logical(1))

# Split the data by column type; drop = FALSE keeps a data frame even if
# only a single column matches
numerical_data <- Train_df_subset[, numerical_vars, drop = FALSE]
Categorical_data <- Train_df_subset[, Categorical_vars, drop = FALSE]

# Skewness of every numeric variable.
# na.rm = TRUE ignores missing values, so columns that still contain NAs
# (e.g. LotFrontage) get a skewness value instead of NA and are not
# silently excluded from any later skewness-based filter.
skewness_values <- vapply(
  numerical_data,
  function(column) skewness(column, na.rm = TRUE),
  numeric(1)
)
head(skewness_values)
##          Id  MSSubClass LotFrontage     LotArea OverallQual OverallCond 
##   0.0000000   1.4047656          NA  12.1826150   0.2164984   0.6916440

EDA:

# Per-variable summary of the numeric columns: min, quartiles, median,
# mean, max and the number of NAs
summary(numerical_data)
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
# Histograms of the strongly right-skewed numeric variables (skewness > 2).
# Columns are selected by name; which() drops variables whose skewness is
# NA, so the NA handling is explicit rather than left to keep().
# pivot_longer() replaces the superseded gather() and produces the same
# key/value layout, one facet per variable.
numerical_data[names(which(skewness_values > 2))] %>%
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# One-way ANOVA of SalePrice against each categorical variable.
# For every column of Categorical_data the p-value of the F-test is taken from
# the first row of the ANOVA table.
anova_results <- lapply(Categorical_data, function(x) {
  aov_result <- aov(Train_df_subset$SalePrice ~ x)
  summary(aov_result)[[1]][["Pr(>F)"]][1]
})

# Combine variable names and p-values into a data frame.
# stringsAsFactors = FALSE keeps the names as character on every R version, so
# they can be used safely as column names below.
anova_results_df <- data.frame(variable = names(anova_results),
                               p_value = unlist(anova_results),
                               stringsAsFactors = FALSE)

# Keep variables with significant p-values (p < 0.05).
# which() drops NA comparisons, so a variable whose p-value is NA/NaN is
# skipped instead of producing an NA index.
significant_vars <- anova_results_df$variable[which(anova_results_df$p_value < 0.05)]

# Subset the categorical data to the significant variables only.
# drop = FALSE keeps a data frame even when a single variable is selected.
significant_categorical_data <- Categorical_data[, significant_vars, drop = FALSE]
# Bar charts of the level counts of each significant categorical variable,
# one facet per variable with independent axis scales
significant_categorical_data %>%
  gather() %>%
  ggplot(mapping = aes(x = value)) +
  geom_bar() +
  facet_wrap(facets = ~ key, scales = "free")

# Add natural-log versions of LotArea and SalePrice to the training subset
Train_df_subset[["log_LotArea"]] <- log(Train_df_subset[["LotArea"]])
Train_df_subset[["log_SalePrice"]] <- log(Train_df_subset[["SalePrice"]])

# Histograms of each variable before and after the transformation
ggplot(data = Train_df_subset, mapping = aes(x = LotArea)) +
  geom_histogram(binwidth = 100) +
  ggtitle("Histogram of LotArea (Original)")

ggplot(data = Train_df_subset, mapping = aes(x = log_LotArea)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of Log(LotArea)")

ggplot(data = Train_df_subset, mapping = aes(x = SalePrice)) +
  geom_histogram(binwidth = 10000) +
  ggtitle("Histogram of SalePrice (Original)")

ggplot(data = Train_df_subset, mapping = aes(x = log_SalePrice)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of Log(SalePrice)")

As we can see, the original variables have right-skewed distributions, whereas the log-transformed variables are approximately normally distributed.

Linear Regression Models:

Now, in order to build a better model and obtain better predictions, let’s calculate the correlation between each numerical variable and Sale Price to identify potential relationships.

# Exclude SalePrice from numerical_data
numerical_data_without_saleprice <- numerical_data[, !colnames(numerical_data) %in% "SalePrice"]

# Calculate correlations between all numerical variables (excluding SalePrice) and SalePrice.
# NOTE: use = "complete.obs" applies casewise deletion, so every row that has
# an NA in any of the numeric columns is dropped before the correlations are
# computed.
correlation_with_saleprice <- cor(numerical_data_without_saleprice, Train_df_subset$SalePrice, use = "complete.obs")

# Find the variable with the strongest correlation with SalePrice.
# cor() returns a one-column matrix here and which.max() on a matrix gives an
# unnamed index, so the variable name is looked up through rownames().
strongest_correlation <- rownames(correlation_with_saleprice)[which.max(abs(correlation_with_saleprice))]

# Print the correlation values
print(correlation_with_saleprice)
##                       [,1]
## Id            -0.047121850
## MSSubClass    -0.088031702
## LotFrontage    0.344269772
## LotArea        0.299962206
## OverallQual    0.797880680
## OverallCond   -0.124391232
## YearBuilt      0.525393598
## YearRemodAdd   0.521253270
## MasVnrArea     0.488658155
## BsmtFinSF1     0.390300523
## BsmtFinSF2    -0.028021366
## BsmtUnfSF      0.213128680
## TotalBsmtSF    0.615612237
## X1stFlrSF      0.607969106
## X2ndFlrSF      0.306879002
## LowQualFinSF  -0.001481983
## GrLivArea      0.705153567
## BsmtFullBath   0.236737407
## BsmtHalfBath  -0.036512665
## FullBath       0.566627442
## HalfBath       0.268560303
## BedroomAbvGr   0.166813894
## KitchenAbvGr  -0.140497445
## TotRmsAbvGrd   0.547067360
## Fireplaces     0.461872689
## GarageYrBlt    0.504753018
## GarageCars     0.647033611
## GarageArea     0.619329622
## WoodDeckSF     0.336855121
## OpenPorchSF    0.343353812
## EnclosedPorch -0.154843204
## X3SsnPorch     0.030776594
## ScreenPorch    0.110426815
## PoolArea       0.092488120
## MiscVal       -0.036041237
## MoSold         0.051568064
## YrSold        -0.011868823
# Print the variable with the strongest correlation
print(strongest_correlation)
## [1] "OverallQual"

Based on the correlations calculated above, we select the following variables, which are among those most strongly correlated with Sale Price (absolute correlation of roughly 0.3 or higher): “LotArea”, “OverallQual”, “YearBuilt”, “YearRemodAdd”, “TotalBsmtSF”, “X1stFlrSF”, “GrLivArea”, “GarageCars”, “GarageArea”.

# Keep only the predictors selected for their correlation with SalePrice
strongest_correlation_variables <- c(
  "LotArea", "OverallQual", "YearBuilt", "YearRemodAdd", "TotalBsmtSF",
  "X1stFlrSF", "GrLivArea", "GarageCars", "GarageArea"
)
strongest_correlation_data <- numerical_data_without_saleprice[
  , strongest_correlation_variables, drop = FALSE
]

# Pairwise correlations among the selected predictors
correlation_matrix_strongest <- cor(x = strongest_correlation_data)

# Show the predictor-by-predictor correlation matrix
print(correlation_matrix_strongest)
##                 LotArea OverallQual  YearBuilt YearRemodAdd TotalBsmtSF
## LotArea      1.00000000   0.1058057 0.01422765   0.01378843   0.2608331
## OverallQual  0.10580574   1.0000000 0.57232277   0.55068392   0.5378085
## YearBuilt    0.01422765   0.5723228 1.00000000   0.59285498   0.3914520
## YearRemodAdd 0.01378843   0.5506839 0.59285498   1.00000000   0.2910656
## TotalBsmtSF  0.26083313   0.5378085 0.39145200   0.29106558   1.0000000
## X1stFlrSF    0.29947458   0.4762238 0.28198586   0.24037927   0.8195300
## GrLivArea    0.26311617   0.5930074 0.19900971   0.28738852   0.4548682
## GarageCars   0.15487074   0.6006707 0.53785009   0.42062215   0.4345848
## GarageArea   0.18040276   0.5620218 0.47895382   0.37159981   0.4866655
##              X1stFlrSF GrLivArea GarageCars GarageArea
## LotArea      0.2994746 0.2631162  0.1548707  0.1804028
## OverallQual  0.4762238 0.5930074  0.6006707  0.5620218
## YearBuilt    0.2819859 0.1990097  0.5378501  0.4789538
## YearRemodAdd 0.2403793 0.2873885  0.4206222  0.3715998
## TotalBsmtSF  0.8195300 0.4548682  0.4345848  0.4866655
## X1stFlrSF    1.0000000 0.5660240  0.4393168  0.4897817
## GrLivArea    0.5660240 1.0000000  0.4672474  0.4689975
## GarageCars   0.4393168 0.4672474  1.0000000  0.8824754
## GarageArea   0.4897817 0.4689975  0.8824754  1.0000000
# Names of the selected predictors that are correlated (|r| >= 0.3) with at
# least one OTHER selected predictor (the diagonal is excluded).
# which() on a matrix returns unnamed linear indices, so names(which(...)) is
# always NULL; the names are taken from the rows of the matrix instead.
# This vector is informational only: the model below uses every column of
# strongest_correlation_data.
strong_correlation_variables <- rownames(correlation_matrix_strongest)[
  rowSums(
    abs(correlation_matrix_strongest) >= 0.3 &
      row(correlation_matrix_strongest) != col(correlation_matrix_strongest)
  ) > 0
]

# Add SalePrice to strongest_correlation_data
strongest_correlation_data <- cbind(strongest_correlation_data, SalePrice = Train_df_subset$SalePrice)

# Fit linear model of SalePrice on all selected predictors
lm_model <- lm(SalePrice ~ ., data = strongest_correlation_data)
# Print the summary of the linear model
summary(lm_model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489636  -19500   -1834   14990  295150 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.125e+06  1.204e+05  -9.344  < 2e-16 ***
## LotArea       6.386e-01  1.051e-01   6.076 1.57e-09 ***
## OverallQual   2.017e+04  1.180e+03  17.092  < 2e-16 ***
## YearBuilt     2.411e+02  4.741e+01   5.086 4.14e-07 ***
## YearRemodAdd  2.897e+02  6.260e+01   4.628 4.01e-06 ***
## TotalBsmtSF   1.911e+01  4.226e+00   4.522 6.62e-06 ***
## X1stFlrSF     1.132e+01  4.885e+00   2.317  0.02063 *  
## GrLivArea     4.515e+01  2.707e+00  16.677  < 2e-16 ***
## GarageCars    9.335e+03  2.986e+03   3.126  0.00181 ** 
## GarageArea    1.629e+01  1.012e+01   1.609  0.10774    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37520 on 1450 degrees of freedom
## Multiple R-squared:  0.7783, Adjusted R-squared:  0.7769 
## F-statistic: 565.7 on 9 and 1450 DF,  p-value: < 2.2e-16

We can see that the p-value of GarageArea is 0.10774, which is greater than 0.05. To improve the linear model above, let’s use backward elimination to remove the least significant variables:

# Backward elimination, step 1: refit the full model without GarageArea,
# the predictor with the highest p-value in summary(lm_model)
lm_model_initial <- lm(SalePrice ~ . - GarageArea, data = strongest_correlation_data)

# Assess model fit
summary(lm_model_initial)
## 
## Call:
## lm(formula = SalePrice ~ . - GarageArea, data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -480779  -19678   -1749   15393  294396 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.121e+06  1.205e+05  -9.306  < 2e-16 ***
## LotArea       6.424e-01  1.051e-01   6.111 1.27e-09 ***
## OverallQual   2.015e+04  1.180e+03  17.067  < 2e-16 ***
## YearBuilt     2.406e+02  4.744e+01   5.073 4.43e-07 ***
## YearRemodAdd  2.879e+02  6.262e+01   4.597 4.66e-06 ***
## TotalBsmtSF   1.976e+01  4.210e+00   4.694 2.93e-06 ***
## X1stFlrSF     1.179e+01  4.879e+00   2.417   0.0158 *  
## GrLivArea     4.531e+01  2.707e+00  16.736  < 2e-16 ***
## GarageCars    1.317e+04  1.800e+03   7.316 4.22e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37540 on 1451 degrees of freedom
## Multiple R-squared:  0.7779, Adjusted R-squared:  0.7767 
## F-statistic: 635.4 on 8 and 1451 DF,  p-value: < 2.2e-16

Removing GarageArea from the model has a very small impact on the model’s fit. Let’s remove the X1stFlrSF and see the impact:

# Refit the full model without X1stFlrSF.
# NOTE: the formula only excludes X1stFlrSF, so GarageArea (dropped in
# lm_model_initial) is included again in this model.
lm_model_initial1 <- lm(SalePrice ~ . - X1stFlrSF, data = strongest_correlation_data)

# Assess model fit
summary(lm_model_initial1)
## 
## Call:
## lm(formula = SalePrice ~ . - X1stFlrSF, data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -495369  -19133   -2244   15155  294325 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.109e+06  1.204e+05  -9.214  < 2e-16 ***
## LotArea       6.568e-01  1.050e-01   6.258 5.12e-10 ***
## OverallQual   1.989e+04  1.176e+03  16.920  < 2e-16 ***
## YearBuilt     2.351e+02  4.741e+01   4.958 7.98e-07 ***
## YearRemodAdd  2.894e+02  6.269e+01   4.616 4.25e-06 ***
## TotalBsmtSF   2.627e+01  2.890e+00   9.090  < 2e-16 ***
## GrLivArea     4.721e+01  2.561e+00  18.431  < 2e-16 ***
## GarageCars    9.453e+03  2.991e+03   3.161   0.0016 ** 
## GarageArea    1.770e+01  1.012e+01   1.749   0.0805 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37580 on 1451 degrees of freedom
## Multiple R-squared:  0.7775, Adjusted R-squared:  0.7763 
## F-statistic: 633.8 on 8 and 1451 DF,  p-value: < 2.2e-16

Backward-eliminating X1stFlrSF has a slightly larger impact on the model’s fit. We can see that by comparing the adjusted R-squared of the three models: it was 0.7769 for the full model, decreased to 0.7767 without GarageArea (a very slight difference), and decreased to 0.7763 without X1stFlrSF. So let’s perform residual analysis on the initial model (before removing any of the variables).

Residual Analysis:

# Residuals of the full linear model against its fitted values; the red
# horizontal line marks a residual of zero
plot(
  x = fitted(lm_model),
  y = resid(lm_model),
  main = "Residuals vs Fitted",
  xlab = "Fitted values",
  ylab = "Residuals"
)
abline(h = 0, col = "red")

The points are not randomly scattered around 0, so the linear model might not be suitable for the data. Let’s confirm that with a normal Q-Q plot:

# Normal Q-Q plot of the full linear model's residuals, with its reference line
qqnorm(y = resid(lm_model))
qqline(y = resid(lm_model))

The points do not all fall on the reference line, which suggests that the residuals do not follow a normal distribution perfectly. For further investigation of the linear model, let’s plot the actual sale prices against the fitted (predicted) values to see how closely the predictions track the observed prices.

# Predictions of the full linear model for the data it was fitted on
predicted <- predict(lm_model)

# Actual sale prices against the model's predictions
plot(
  x = predicted,
  y = strongest_correlation_data[["SalePrice"]],
  main = "Response vs. Fitted (Predicted) Values",
  xlab = "Fitted (Predicted) Values",
  ylab = "Actual Sale Price"
)

# Reference line with intercept 0 and slope 1 (actual == predicted)
abline(a = 0, b = 1, col = "red")

The points do not all fall on the red reference line (intercept 0, slope 1), so we may need more data processing before we fit the model again. One of the options is to log-transform the predictors.

# Append a natural-log copy (named "log_<variable>") of each listed predictor
# to strongest_correlation_data
strongest_correlation_data <- local({
  log_predictors <- c(
    "LotArea", "OverallQual", "YearBuilt", "YearRemodAdd",
    "TotalBsmtSF", "X1stFlrSF", "GrLivArea"
  )
  transformed <- strongest_correlation_data
  transformed[paste0("log_", log_predictors)] <-
    lapply(transformed[log_predictors], log)
  transformed
})

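Before we fit the linear model on the log-transformed predictors, let’s check the original variables for zero or negative values, for which the logarithm is not finite: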

# Count the non-positive entries of the original variables; log() does not
# return a finite value for them
sum(strongest_correlation_data[["TotalBsmtSF"]] <= 0)
## [1] 37
sum(strongest_correlation_data[["GrLivArea"]] <= 0)
## [1] 0

After checking for zero or negative values, we find that the variable TotalBsmtSF has 37 values that are \(\leq 0\). Let’s add a small constant to it to avoid any issue with the log transformation:

# Shift TotalBsmtSF by a tiny positive constant before taking the log, so that
# zero entries produce a finite value
epsilon <- 1e-6
strongest_correlation_data[["log_TotalBsmtSF"]] <-
  log(strongest_correlation_data[["TotalBsmtSF"]] + epsilon)

Let’s check again for zero and/or negative values:

# Confirm that the shifted log transform left no non-finite values
# (-Inf, NaN or NA) in log_TotalBsmtSF. The column name starts with a
# lower-case "l"; `$` returns NULL for a misspelled column, which would make
# the count 0 regardless of the data.
sum(!is.finite(strongest_correlation_data$log_TotalBsmtSF))
## [1] 0
# Fit a linear model of SalePrice (left untransformed) on the seven
# log-transformed predictors; GarageCars and GarageArea are not part of this
# formula
lm_model_log <- lm(SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + log_YearRemodAdd +
                      log_TotalBsmtSF + log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)

# Print summary of the linear model
summary(lm_model_log)
## 
## Call:
## lm(formula = SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + 
##     log_YearRemodAdd + log_TotalBsmtSF + log_X1stFlrSF + log_GrLivArea, 
##     data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -296387  -22172   -3695   15019  378800 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.374e+07  9.862e+05 -13.930  < 2e-16 ***
## log_LotArea       2.216e+04  2.467e+03   8.984  < 2e-16 ***
## log_OverallQual   1.012e+05  7.127e+03  14.193  < 2e-16 ***
## log_YearBuilt     8.804e+05  9.606e+04   9.165  < 2e-16 ***
## log_YearRemodAdd  7.997e+05  1.372e+05   5.829 6.86e-09 ***
## log_TotalBsmtSF   6.252e+02  3.495e+02   1.789   0.0738 .  
## log_X1stFlrSF     3.894e+04  4.487e+03   8.679  < 2e-16 ***
## log_GrLivArea     7.002e+04  4.679e+03  14.964  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41730 on 1452 degrees of freedom
## Multiple R-squared:  0.7254, Adjusted R-squared:  0.7241 
## F-statistic: 547.9 on 7 and 1452 DF,  p-value: < 2.2e-16

Let’s remove the log_TotalBsmtSF since it has the highest p-value (\(0.0738 > 0.05\))

# Refit the log-predictor model without log_TotalBsmtSF; the remaining six
# log-transformed predictors are unchanged
lm_model_log1 <- lm(SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + log_YearRemodAdd +
                       log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)

# Print summary of the linear model
summary(lm_model_log1)
## 
## Call:
## lm(formula = SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + 
##     log_YearRemodAdd + log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -295942  -22115   -3957   15135  378681 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -13716909     986854 -13.900  < 2e-16 ***
## log_LotArea          22234       2469   9.006  < 2e-16 ***
## log_OverallQual     104217       6924  15.052  < 2e-16 ***
## log_YearBuilt       868123      95889   9.053  < 2e-16 ***
## log_YearRemodAdd    809329     137187   5.899 4.53e-09 ***
## log_X1stFlrSF        39152       4489   8.722  < 2e-16 ***
## log_GrLivArea        69350       4668  14.858  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41760 on 1453 degrees of freedom
## Multiple R-squared:  0.7248, Adjusted R-squared:  0.7236 
## F-statistic: 637.7 on 6 and 1453 DF,  p-value: < 2.2e-16

Let’s do residual analysis and see if there is any improvement:

Log Residual Analysis:

# Residuals of the reduced log-predictor model against its fitted values; the
# red horizontal line marks a residual of zero
plot(
  x = fitted(lm_model_log1),
  y = resid(lm_model_log1),
  main = "Residuals vs Fitted",
  xlab = "Fitted values",
  ylab = "Residuals"
)
abline(h = 0, col = "red")

Even after the log transformation, the residual plot shows that the linear model is still not a good fit for the data: the points form an upward curve, which indicates that a polynomial regression model might be more appropriate.

Polynomial Regression Model:

# Fit a polynomial regression model: SalePrice on degree-2 polynomial terms of
# the six log-transformed predictors kept in lm_model_log1.
# poly() is called without raw = TRUE, i.e. with its default orthogonal
# polynomial basis, so each predictor contributes a "1" (linear) and a "2"
# (quadratic) coefficient in the summary.
poly_model <- lm(SalePrice ~ poly(log_LotArea, degree = 2) + 
                           poly(log_OverallQual, degree = 2) + 
                           poly(log_YearBuilt, degree = 2) + 
                           poly(log_YearRemodAdd, degree = 2) + 
                           poly(log_X1stFlrSF, degree = 2) + 
                           poly(log_GrLivArea, degree = 2), 
                 data = strongest_correlation_data)

# Print summary of the polynomial regression model
summary(poly_model)
## 
## Call:
## lm(formula = SalePrice ~ poly(log_LotArea, degree = 2) + poly(log_OverallQual, 
##     degree = 2) + poly(log_YearBuilt, degree = 2) + poly(log_YearRemodAdd, 
##     degree = 2) + poly(log_X1stFlrSF, degree = 2) + poly(log_GrLivArea, 
##     degree = 2), data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -430487  -17419      41   15975  278478 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          180921.2      955.5 189.338  < 2e-16 ***
## poly(log_LotArea, degree = 2)1       376822.9    43564.3   8.650  < 2e-16 ***
## poly(log_LotArea, degree = 2)2        76127.1    39018.1   1.951 0.051241 .  
## poly(log_OverallQual, degree = 2)1  1161292.8    59583.4  19.490  < 2e-16 ***
## poly(log_OverallQual, degree = 2)2   501306.7    44565.2  11.249  < 2e-16 ***
## poly(log_YearBuilt, degree = 2)1     436176.6    55388.6   7.875 6.65e-15 ***
## poly(log_YearBuilt, degree = 2)2     -89905.9    50126.0  -1.794 0.073086 .  
## poly(log_YearRemodAdd, degree = 2)1  288994.8    52800.0   5.473 5.20e-08 ***
## poly(log_YearRemodAdd, degree = 2)2  159474.0    40914.2   3.898 0.000102 ***
## poly(log_X1stFlrSF, degree = 2)1     442762.2    48407.2   9.147  < 2e-16 ***
## poly(log_X1stFlrSF, degree = 2)2     -74080.8    43827.9  -1.690 0.091193 .  
## poly(log_GrLivArea, degree = 2)1     806661.4    53667.2  15.031  < 2e-16 ***
## poly(log_GrLivArea, degree = 2)2     454717.3    43148.5  10.538  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36510 on 1447 degrees of freedom
## Multiple R-squared:  0.7905, Adjusted R-squared:  0.7888 
## F-statistic:   455 on 12 and 1447 DF,  p-value: < 2.2e-16

Let’s remove the predictors whose quadratic terms have a p-value above 0.05, log_X1stFlrSF and log_YearBuilt, and see if there is any improvement in the model:

# Fit a reduced polynomial regression model: same as poly_model but without
# the log_YearBuilt and log_X1stFlrSF terms
poly_model1 <- lm(SalePrice ~ poly(log_LotArea, degree = 2) + 
                           poly(log_OverallQual, degree = 2) + 
                           poly(log_YearRemodAdd, degree = 2) + 
                           poly(log_GrLivArea, degree = 2), 
                 data = strongest_correlation_data)

# Print summary of the polynomial regression model
summary(poly_model1)
## 
## Call:
## lm(formula = SalePrice ~ poly(log_LotArea, degree = 2) + poly(log_OverallQual, 
##     degree = 2) + poly(log_YearRemodAdd, degree = 2) + poly(log_GrLivArea, 
##     degree = 2), data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -402637  -19644    -318   18278  291902 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           180921       1020 177.414  < 2e-16 ***
## poly(log_LotArea, degree = 2)1        541973      42668  12.702  < 2e-16 ***
## poly(log_LotArea, degree = 2)2         50852      39410   1.290    0.197    
## poly(log_OverallQual, degree = 2)1   1433359      55972  25.609  < 2e-16 ***
## poly(log_OverallQual, degree = 2)2    606174      44284  13.688  < 2e-16 ***
## poly(log_YearRemodAdd, degree = 2)1   433681      47064   9.215  < 2e-16 ***
## poly(log_YearRemodAdd, degree = 2)2    52160      40332   1.293    0.196    
## poly(log_GrLivArea, degree = 2)1      831695      52975  15.700  < 2e-16 ***
## poly(log_GrLivArea, degree = 2)2      352859      42857   8.233 4.01e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38970 on 1451 degrees of freedom
## Multiple R-squared:  0.7607, Adjusted R-squared:  0.7594 
## F-statistic: 576.7 on 8 and 1451 DF,  p-value: < 2.2e-16
# Fit an alternative reduced polynomial regression model: same as poly_model
# but without the log_LotArea and log_YearRemodAdd terms
poly_model2 <- lm(SalePrice ~poly(log_OverallQual, degree = 2) + 
                           poly(log_YearBuilt, degree = 2) + 
                           poly(log_X1stFlrSF, degree = 2) + 
                           poly(log_GrLivArea, degree = 2), 
                 data = strongest_correlation_data)

# Print summary of the polynomial regression model
summary(poly_model2)
## 
## Call:
## lm(formula = SalePrice ~ poly(log_OverallQual, degree = 2) + 
##     poly(log_YearBuilt, degree = 2) + poly(log_X1stFlrSF, degree = 2) + 
##     poly(log_GrLivArea, degree = 2), data = strongest_correlation_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -408809  -18322    -457   16576  270578 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         180921.2      993.7 182.062   <2e-16 ***
## poly(log_OverallQual, degree = 2)1 1186683.5    61507.6  19.293   <2e-16 ***
## poly(log_OverallQual, degree = 2)2  524430.8    45549.9  11.513   <2e-16 ***
## poly(log_YearBuilt, degree = 2)1    537237.2    49726.7  10.804   <2e-16 ***
## poly(log_YearBuilt, degree = 2)2     15367.3    44359.3   0.346   0.7291    
## poly(log_X1stFlrSF, degree = 2)1    584356.1    47366.3  12.337   <2e-16 ***
## poly(log_X1stFlrSF, degree = 2)2    -91936.2    42986.0  -2.139   0.0326 *  
## poly(log_GrLivArea, degree = 2)1    893221.3    53662.9  16.645   <2e-16 ***
## poly(log_GrLivArea, degree = 2)2    492443.7    44408.4  11.089   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37970 on 1451 degrees of freedom
## Multiple R-squared:  0.7728, Adjusted R-squared:  0.7716 
## F-statistic: 616.9 on 8 and 1451 DF,  p-value: < 2.2e-16

Based on the adjusted R-squared, the original model using all predictors is better; around 79% of the variance in Sale Price is explained by those predictors. Let’s confirm that by plotting the residuals:

# Residuals of the full polynomial model
residuals <- resid(poly_model)

# Residuals against fitted values; the red horizontal line marks zero
plot(
  x = fitted(poly_model),
  y = residuals,
  col = "blue",
  pch = 20,
  main = "Residuals vs. Fitted",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
abline(h = 0, col = "red")

# Normal Q-Q plot of the same residuals with its reference line
qqnorm(y = residuals, main = "QQ Plot of Residuals")
qqline(y = residuals)

So let’s use it to predict the sale prices of the houses in the test data.

# Build the predictors that poly_model expects from the held-out test set,
# applying the same log transformations that were used on the training data.
# Passing the training data as newdata would only reproduce in-sample fitted
# values, not predictions for the test houses.
# NOTE(review): assumes Test_DataFrame has no missing or non-positive values in
# these six columns (poly() does not accept missing values) - confirm.
test_predictors <- data.frame(
  log_LotArea = log(Test_DataFrame$LotArea),
  log_OverallQual = log(Test_DataFrame$OverallQual),
  log_YearBuilt = log(Test_DataFrame$YearBuilt),
  log_YearRemodAdd = log(Test_DataFrame$YearRemodAdd),
  log_X1stFlrSF = log(Test_DataFrame$X1stFlrSF),
  log_GrLivArea = log(Test_DataFrame$GrLivArea)
)

# One row per test house, keyed by the Id column of the test file rather than
# by a sequence derived from the number of training rows
Predictions_df <- data.frame(Id = Test_DataFrame$Id)

# Predict sale prices using the fitted polynomial regression model
Predictions_df$Predicted_SalePrice <- predict(poly_model, newdata = test_predictors)

# Print the first few rows of the new data frame
head(Predictions_df)
##     Id Predicted_SalePrice
## 1 1461            215078.8
## 2 1462            162700.1
## 3 1463            226393.9
## 4 1464            169629.1
## 5 1465            286207.5
## 6 1466            147025.6

Now let’s write the Predictions_df as .csv file and save it on the directory:

# Save the predictions as a CSV file in the working directory, without a
# row-name column
write.csv(
  x = Predictions_df,
  file = "predicted_sale_prices.csv",
  row.names = FALSE
)

Conclusion:

In conclusion, let’s take a look at the most important variables:

# Unscaled importance of every term of the full polynomial model, as reported
# by varImp(); one row per model term, in a single "Overall" column
importance <- varImp(object = poly_model, scale = FALSE)
print(importance)
##                                       Overall
## poly(log_LotArea, degree = 2)1       8.649806
## poly(log_LotArea, degree = 2)2       1.951072
## poly(log_OverallQual, degree = 2)1  19.490211
## poly(log_OverallQual, degree = 2)2  11.248829
## poly(log_YearBuilt, degree = 2)1     7.874847
## poly(log_YearBuilt, degree = 2)2     1.793597
## poly(log_YearRemodAdd, degree = 2)1  5.473382
## poly(log_YearRemodAdd, degree = 2)2  3.897771
## poly(log_X1stFlrSF, degree = 2)1     9.146617
## poly(log_X1stFlrSF, degree = 2)2     1.690264
## poly(log_GrLivArea, degree = 2)1    15.030809
## poly(log_GrLivArea, degree = 2)2    10.538427
# Arrange the importance table by its Overall column in descending order.
# The sort key must be the column name: `importance` is not a column of the
# data frame, so desc(importance) would be evaluated on the whole data frame
# object instead of on a column.
importance_ordered <- importance %>%
  arrange(desc(Overall))

# Print the ordered variable importance dataframe
print(importance_ordered)
##                                       Overall
## poly(log_OverallQual, degree = 2)1  19.490211
## poly(log_GrLivArea, degree = 2)1    15.030809
## poly(log_OverallQual, degree = 2)2  11.248829
## poly(log_GrLivArea, degree = 2)2    10.538427
## poly(log_X1stFlrSF, degree = 2)1     9.146617
## poly(log_LotArea, degree = 2)1       8.649806
## poly(log_YearBuilt, degree = 2)1     7.874847
## poly(log_YearRemodAdd, degree = 2)1  5.473382
## poly(log_YearRemodAdd, degree = 2)2  3.897771
## poly(log_LotArea, degree = 2)2       1.951072
## poly(log_YearBuilt, degree = 2)2     1.793597
## poly(log_X1stFlrSF, degree = 2)2     1.690264

We can see that the OverallQual of the houses is the most important factor affecting the sale prices, followed by GrLivArea. The least important term is the quadratic term of X1stFlrSF.