Model Development to Predict Used Home Prices Using Multiple Regression


Step 1: Acquire and Load the Data


##########################################################
# Load the data
# set current directory
setwd("/Users/whinton/src/rstudio/tim8521")

train_data <- read.csv("train.csv", header = TRUE, sep= ",", stringsAsFactors = TRUE)
test_data <- read.csv("test.csv", header = TRUE, sep= ",",stringsAsFactors = TRUE)
##########################################################

Step 2: Preview and Explore the Data


###########################################################
# Exploration of the Data and Correlation Analysis  
###########################################################
# Preview data
str(train_data)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
summary(train_data)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour  Utilities   
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63    AllPub:1459  
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50    NoSeWa:   1  
##  Median :  9478               NA's:1369   IR3: 10   Low:  36                 
##  Mean   : 10517                           Reg:925   Lvl:1311                 
##  3rd Qu.: 11602                                                              
##  Max.   :215245                                                              
##                                                                              
##    LotConfig    LandSlope   Neighborhood   Condition1     Condition2  
##  Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260   Norm   :1445  
##  CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81   Feedr  :   6  
##  FR2    :  47   Sev:  13   OldTown:113   Artery :  48   Artery :   2  
##  FR3    :   4              Edwards:100   RRAn   :  26   PosN   :   2  
##  Inside :1052              Somerst: 86   PosN   :  19   RRNn   :   2  
##                            Gilbert: 79   RRAe   :  11   PosA   :   1  
##                            (Other):707   (Other):  15   (Other):   2  
##    BldgType      HouseStyle   OverallQual      OverallCond      YearBuilt   
##  1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000   Median :1973  
##  Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971  
##  TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010  
##                (Other): 19                                                  
##   YearRemodAdd    RoofStyle       RoofMatl     Exterior1st   Exterior2nd 
##  Min.   :1950   Flat   :  13   CompShg:1434   VinylSd:515   VinylSd:504  
##  1st Qu.:1967   Gable  :1141   Tar&Grv:  11   HdBoard:222   MetalSd:214  
##  Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220   HdBoard:207  
##  Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206   Wd Sdng:197  
##  3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108   Plywood:142  
##  Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61   CmentBd: 60  
##                                (Other):   2   (Other):128   (Other):136  
##    MasVnrType    MasVnrArea     ExterQual ExterCond  Foundation  BsmtQual  
##  BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3   BrkTil:146   Ex  :121  
##  BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28   CBlock:634   Fa  : 35  
##  None   :864   Median :   0.0   Gd:488    Gd: 146   PConc :647   Gd  :618  
##  Stone  :128   Mean   : 103.7   TA:906    Po:   1   Slab  : 24   TA  :649  
##  NA's   :  8   3rd Qu.: 166.0             TA:1282   Stone :  6   NA's: 37  
##                Max.   :1600.0                       Wood  :  3             
##                NA's   :8                                                   
##  BsmtCond    BsmtExposure BsmtFinType1   BsmtFinSF1     BsmtFinType2
##  Fa  :  45   Av  :221     ALQ :220     Min.   :   0.0   ALQ :  19   
##  Gd  :  65   Gd  :134     BLQ :148     1st Qu.:   0.0   BLQ :  33   
##  Po  :   2   Mn  :114     GLQ :418     Median : 383.5   GLQ :  14   
##  TA  :1311   No  :953     LwQ : 74     Mean   : 443.6   LwQ :  46   
##  NA's:  37   NA's: 38     Rec :133     3rd Qu.: 712.2   Rec :  54   
##                           Unf :430     Max.   :5644.0   Unf :1256   
##                           NA's: 37                      NA's:  38   
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741   
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Median :   0.00   Median : 477.5   Median : 991.5   GasW :  18   Gd:241   
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1   
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428   
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0   Wall :   4            
##                                                                            
##  CentralAir Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  N:  95     FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  Y:1365     FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##             FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##             Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##             SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##             NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                         
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual  TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100      Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39      1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586      Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735      Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000               3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000               Max.   :14.000  
##                                                                               
##  Functional    Fireplaces    FireplaceQu   GarageType   GarageYrBlt  
##  Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6   Min.   :1900  
##  Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870   1st Qu.:1961  
##  Min1:  31   Median :1.000   Gd  :380    Basment: 19   Median :1980  
##  Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88   Mean   :1979  
##  Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9   3rd Qu.:2002  
##  Sev :   1   Max.   :3.000   NA's:690    Detchd :387   Max.   :2010  
##  Typ :1360                               NA's   : 81   NA's   :81    
##  GarageFinish   GarageCars      GarageArea     GarageQual  GarageCond 
##  Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3   Ex  :   2  
##  RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48   Fa  :  35  
##  Unf :605     Median :2.000   Median : 480.0   Gd  :  14   Gd  :   9  
##  NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3   Po  :   7  
##               3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311   TA  :1326  
##               Max.   :4.000   Max.   :1418.0   NA's:  81   NA's:  81  
##                                                                       
##  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Y:1340     Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##             Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##             3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##             Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                                
##   ScreenPorch        PoolArea        PoolQC       Fence      MiscFeature
##  Min.   :  0.00   Min.   :  0.000   Ex  :   2   GdPrv:  59   Gar2:   2  
##  1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2   GdWo :  54   Othr:   2  
##  Median :  0.00   Median :  0.000   Gd  :   3   MnPrv: 157   Shed:  49  
##  Mean   : 15.06   Mean   :  2.759   NA's:1453   MnWw :  11   TenC:   1  
##  3rd Qu.:  0.00   3rd Qu.:  0.000               NA's :1179   NA's:1406  
##  Max.   :480.00   Max.   :738.000                                       
##                                                                         
##     MiscVal             MoSold           YrSold        SaleType   
##  Min.   :    0.00   Min.   : 1.000   Min.   :2006   WD     :1267  
##  1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   New    : 122  
##  Median :    0.00   Median : 6.000   Median :2008   COD    :  43  
##  Mean   :   43.49   Mean   : 6.322   Mean   :2008   ConLD  :   9  
##  3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009   ConLI  :   5  
##  Max.   :15500.00   Max.   :12.000   Max.   :2010   ConLw  :   5  
##                                                     (Other):   9  
##  SaleCondition    SalePrice     
##  Abnorml: 101   Min.   : 34900  
##  AdjLand:   4   1st Qu.:129975  
##  Alloca :  12   Median :163000  
##  Family :  20   Mean   :180921  
##  Normal :1198   3rd Qu.:214000  
##  Partial: 125   Max.   :755000  
## 

Step 3: Preprocess Data


##########################################################
# Preprocess Data
##########################################################


# Check for Erroneous Data Before Handling Missing Values
# List of numeric columns where negative could be invalid
invalid_negative_columns <- c("LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd",
                              "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", 
                              "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", 
                              "GrLivArea", "GarageYrBlt", "GarageArea", "WoodDeckSF", 
                              "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", 
                              "PoolArea", "MiscVal", "SalePrice")

# Detect erroneous data in train and test datasets
erroneous_train <- detect_erroneous_data(train_data, invalid_negative_columns)
erroneous_test <- detect_erroneous_data(test_data, invalid_negative_columns)

# Print detected erroneous data
if (length(erroneous_train) > 0) {
  cat("\n### Erroneous Data Found in Train Dataset ###\n")
  print(erroneous_train)
} else {
  cat("\nNo erroneous values found in Train Dataset.\n")
}
## 
## No erroneous values found in Train Dataset.
if (length(erroneous_test) > 0) {
  cat("\n### Erroneous Data Found in Test Dataset ###\n")
  print(erroneous_test)
} else {
  cat("\nNo erroneous values found in Test Dataset.\n")
}
## 
## No erroneous values found in Test Dataset.
# Identify columns with missing values
missing_values <- colSums(is.na(train_data))
missing_cols <- names(missing_values[missing_values > 0])
missing_cols_train <- missing_cols
print(missing_cols)
##  [1] "LotFrontage"  "Alley"        "MasVnrType"   "MasVnrArea"   "BsmtQual"    
##  [6] "BsmtCond"     "BsmtExposure" "BsmtFinType1" "BsmtFinType2" "Electrical"  
## [11] "FireplaceQu"  "GarageType"   "GarageYrBlt"  "GarageFinish" "GarageQual"  
## [16] "GarageCond"   "PoolQC"       "Fence"        "MiscFeature"
# Imputation of missing values
for (col in missing_cols) {
  if (is.numeric(train_data[[col]])) {
    train_data[[col]][is.na(train_data[[col]])] <- mean(train_data[[col]], na.rm = TRUE)
  } else {
    train_data[[col]][is.na(train_data[[col]])] <- as.character(names(sort(table(train_data[[col]]), decreasing = TRUE)[1]))
  }
}

# Apply the same imputation for test_data
missing_vals_test <- colSums(is.na(test_data))
missing_cols_test <- names(missing_vals_test[missing_vals_test > 0])


for (col in missing_cols_test) {
  if (col %in% missing_cols_train) {
    if (is.numeric(test_data[[col]])) {
      test_data[[col]][is.na(test_data[[col]])] <- mean(train_data[[col]], na.rm = TRUE)  # Use train_data mean
    } else {
      test_data[[col]][is.na(test_data[[col]])] <- as.character(names(sort(table(train_data[[col]]), decreasing = TRUE)[1]))  # Use train_data mode
    }
  }
} 

# Show Missingness Map After Imputation
missmap(train_data)

# Encode categorical variables
train_data <- train_data %>% mutate_if(is.factor, as.numeric)

Step 4: Build initial Multiple Regression Model


##########################################################
# Build Initial Multiple Regression Model (Top 5 Features)
##########################################################

# Select only numeric features
numeric_features <- select_if(train_data, is.numeric)

# Compute correlation matrix
cor_matrix <- cor(numeric_features, use = "complete.obs")

# Identify the top 5 features most correlated with SalePrice (including SalePrice itself)
cor_target <- sort(cor_matrix["SalePrice",], decreasing = TRUE)
top_features <- names(head(cor_target, 5))  # Select top  predictors + SalePrice

# Subset correlation matrix to include only top 40 correlated features
cor_top_5 <- cor_matrix[top_features, top_features]

# Generate the correlation heatmap for the top 5 correlated features
pheatmap(cor_top_5, 
         display_numbers = TRUE, 
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "complete",
         color = colorRampPalette(c("blue", "white", "red"))(200),
         main = "Top 5 Features Correlated with SalePrice")

# Select only the top 5 highly correlated features, including SalePrice
train_data_top5 <- train_data[, c(top_features)]  

# Construct formula using only these features
formula_top5 <- as.formula(paste("SalePrice ~", paste(setdiff(top_features, "SalePrice"), collapse = " + ")))

# Build multiple regression model
lm_model_top5 <- lm(formula_top5, data = train_data_top5)

# Print model summary
summary(lm_model_top5)
## 
## Call:
## lm(formula = formula_top5, data = train_data_top5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372594  -21236   -1594   18625  301129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -98436.050   4820.467 -20.420  < 2e-16 ***
## OverallQual  26988.854   1067.393  25.285  < 2e-16 ***
## GrLivArea       49.573      2.555  19.402  < 2e-16 ***
## GarageCars   11317.522   3126.297   3.620 0.000305 ***
## GarageArea      41.478     10.627   3.903 9.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7411 
## F-statistic:  1045 on 4 and 1455 DF,  p-value: < 2.2e-16
# Validate Model Using Cross-Validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_top5 <- train(formula_top5, data = train_data_top5, method = "lm", trControl = cv_control, metric = "RMSE")

# Print cross-validation results
print(cv_model_top5$results)
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 40239.58 0.7447829 27143.76 6808.893 0.06748995 2476.471
# Check Residuals After Optimization
par(mfrow = c(2,2))  # Arrange plots in a 2x2 grid
plot(lm_model_top5)  # Residual plots


Step 5: Determine Outliers for Feature Removal


# Features with a high number of outliers may be a candidate for removal

##########################################################
# Print outliers for each of the top correlated features
outlier_list <- list()
for (feature in top_features[-1]) {  # Exclude SalePrice
  outlier_data <- detect_outliers(train_data, feature)
  outlier_list[[feature]] <- outlier_data
  if (nrow(outlier_data) > 0) {
    print(paste("Outliers detected in:", feature))
    print(outlier_data)
  }
}
## [1] "Outliers detected in: OverallQual"
##    Id OverallQual
## 1 376           1
## 2 534           1
## [1] "Outliers detected in: GrLivArea"
##      Id GrLivArea
## 1    59      2945
## 2   119      3222
## 3   186      3608
## 4   198      3112
## 5   232      2794
## 6   305      3493
## 7   325      2978
## 8   497      3228
## 9   524      4676
## 10  584      2775
## 11  609      3194
## 12  636      3395
## 13  692      4316
## 14  770      3279
## 15  799      3140
## 16  804      2822
## 17  962      2872
## 18 1025      2898
## 19 1032      3082
## 20 1047      2868
## 21 1143      2828
## 22 1170      3627
## 23 1174      3086
## 24 1176      2872
## 25 1183      4476
## 26 1269      3447
## 27 1299      5642
## 28 1313      2810
## 29 1329      2792
## 30 1354      3238
## 31 1387      2784
## [1] "Outliers detected in: GarageCars"
##     Id GarageCars
## 1  421          4
## 2  748          4
## 3 1191          4
## 4 1341          4
## 5 1351          4
## [1] "Outliers detected in: GarageArea"
##      Id GarageArea
## 1   179       1166
## 2   225        968
## 3   271       1053
## 4   318       1025
## 5   409        947
## 6   582       1390
## 7   665       1134
## 8   719        983
## 9   804       1020
## 10  826       1220
## 11 1062       1248
## 12 1088       1043
## 13 1143       1052
## 14 1185        995
## 15 1191       1356
## 16 1229       1052
## 17 1242        954
## 18 1269       1014
## 19 1299       1418
## 20 1351        968
## 21 1418       1069
##########################################################
# Visualize Outliers Using Boxplots
##########################################################

# Generate boxplots for each of the top features
boxplot_list <- list()
for (feature in top_features[-1]) {
  p <- ggplot(train_data, aes_string(x = "1", y = feature)) +
    geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.shape = 16) +
    labs(title = paste(" ", feature), x = "", y = feature) +
    theme_minimal()
  boxplot_list[[feature]] <- p
}

# Display boxplots in a grid format
grid.arrange(grobs = boxplot_list, ncol = 4)


Step 6: Optimize with Feature Selection. Rebuild Model


##########################################################
# Correlation Analysis with Selecte Features
##########################################################

# Convert categorical variables to numeric
train_data$Neighborhood <- as.numeric(factor(train_data$Neighborhood))
train_data$ExterQual <- as.numeric(factor(train_data$ExterQual))
train_data$BsmtQual <- as.numeric(factor(train_data$BsmtQual))

# Select numeric features for correlation analysis
numeric_features <- select_if(train_data, is.numeric)

# Compute correlation matrix
cor_matrix <- cor(numeric_features, use = "complete.obs")

# Identify top 10 features most correlated with SalePrice
cor_target <- sort(cor_matrix["SalePrice",], decreasing = TRUE)
top_features <- names(head(cor_target, 10))

# Manually add additional recommended features
additional_features <- c("Neighborhood", "LotFrontage", "ExterQual", "BsmtQual")

# Ensure the additional features exist in train_data
additional_features <- additional_features[additional_features %in% names(train_data)] 

# Remove high-outlier features before finalizing features list (Exclude SalePrice)
features_to_remove <- c("GrLivArea", "GarageArea")
final_features <- setdiff(unique(c(top_features, additional_features, "SalePrice")), features_to_remove)

# Subset correlation matrix for selected features
cor_selected <- cor_matrix[final_features, final_features]

# Generate correlation heatmap
pheatmap(cor_selected, 
         display_numbers = TRUE, 
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "complete",
         color = colorRampPalette(c("blue", "white", "red"))(200),
         main = "Correlation Matrix for Selected Features")

##########################################################
# Optimize and Rebuild Model (Removing High VIF Features)
##########################################################

# Step 1: Subset Data with Selected Features
train_data_top <- train_data[, final_features]  # Ensure SalePrice is included

# Step 2: Identify High VIF Features
lm_model_top <- lm(SalePrice ~ ., data = train_data_top)  
vif_values <- vif(lm_model_top)  

# Identify the top features with the highest VIF
top_vif_features <- names(vif_values[vif_values > 5])

# Step 3: Remove High VIF Features (Excluding SalePrice)
features_to_remove_vif <- setdiff(top_vif_features, "SalePrice")
train_data_filtered <- train_data_top %>% dplyr::select(-all_of(features_to_remove_vif))

# Step 4: Rebuild Model with Optimized Features
optimized_features <- setdiff(names(train_data_filtered), "SalePrice")
formula_optimized <- as.formula(paste("SalePrice ~", paste(optimized_features, collapse = " + ")))
lm_model_optimized <- lm(formula_optimized, data = train_data_filtered)

# Step 5: Print Model Summary After Optimization
summary(lm_model_optimized)
## 
## Call:
## lm(formula = formula_optimized, data = train_data_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -405663  -19851   -1088   15772  339206 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.789e+05  9.645e+04  -1.855  0.06379 .  
## OverallQual   2.003e+04  1.208e+03  16.585  < 2e-16 ***
## GarageCars    1.352e+04  1.857e+03   7.282 5.39e-13 ***
## TotalBsmtSF   1.429e+01  4.363e+00   3.274  0.00108 ** 
## X1stFlrSF     2.605e+01  4.925e+00   5.288 1.43e-07 ***
## FullBath      1.365e+03  2.606e+03   0.524  0.60061    
## TotRmsAbvGrd  8.805e+03  8.464e+02  10.402  < 2e-16 ***
## YearBuilt     9.432e+01  4.839e+01   1.949  0.05150 .  
## Neighborhood  1.440e+02  1.744e+02   0.826  0.40913    
## LotFrontage   1.090e+02  5.141e+01   2.121  0.03409 *  
## ExterQual    -1.386e+04  2.015e+03  -6.878 9.00e-12 ***
## BsmtQual     -1.144e+04  1.607e+03  -7.116 1.75e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38360 on 1448 degrees of freedom
## Multiple R-squared:  0.7686, Adjusted R-squared:  0.7669 
## F-statistic: 437.3 on 11 and 1448 DF,  p-value: < 2.2e-16
# Step 6: Validate with Cross-Validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_optimized <- train(formula_optimized, data = train_data_filtered, method = "lm", trControl = cv_control, metric = "RMSE")

# Print updated cross-validation results
print(cv_model_optimized$results)
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 38334.16 0.7688749 25291.06 8515.063 0.08354885 2874.598
# Step 7: Check Residuals After Optimization
par(mfrow = c(2,2))
plot(lm_model_optimized)


This study performed by Will Hinton