First, let’s load the dataset.

library(mlr)
library(ggplot2)
# Training data: text columns are read in as factors.
train <- read.csv(file = "train.csv", stringsAsFactors = TRUE)
# Test data: text columns are kept as character vectors.
test <- read.csv(file = "test.csv", stringsAsFactors = FALSE)

Next, order the features alphabetically; this makes them easier to find and compare.

# Rearrange the columns of `train` into alphabetical order of their names.
column_order <- order(names(train))
train1 <- train[, column_order]

Check out the structure of the train1 dataset:

# Compact overview of every column: its type and first few values.
str(object = train1)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...

Let’s see the summary.

# Per-column summary from mlr::summarizeColumns(), restricted to the columns
# of interest. Columns are selected by name rather than by position so the
# call stays readable and does not depend on the order of the summary columns.
# For factor columns, `min`/`max` are the sizes of the least and most
# frequent levels (compare with the per-level counts printed further below).
summarizeColumns(train1)[, c("name", "type", "na", "min", "max", "nlevs")]
##             name    type   na   min    max nlevs
## 1          Alley  factor 1369    41     50     2
## 2   BedroomAbvGr integer    0     0      8     0
## 3       BldgType  factor    0    31   1220     5
## 4       BsmtCond  factor   37     2   1311     4
## 5   BsmtExposure  factor   38   114    953     4
## 6     BsmtFinSF1 integer    0     0   5644     0
## 7     BsmtFinSF2 integer    0     0   1474     0
## 8   BsmtFinType1  factor   37    74    430     6
## 9   BsmtFinType2  factor   38    14   1256     6
## 10  BsmtFullBath integer    0     0      3     0
## 11  BsmtHalfBath integer    0     0      2     0
## 12      BsmtQual  factor   37    35    649     4
## 13     BsmtUnfSF integer    0     0   2336     0
## 14    CentralAir  factor    0    95   1365     2
## 15    Condition1  factor    0     2   1260     9
## 16    Condition2  factor    0     1   1445     8
## 17    Electrical  factor    1     1   1334     5
## 18 EnclosedPorch integer    0     0    552     0
## 19     ExterCond  factor    0     1   1282     5
## 20   Exterior1st  factor    0     1    515    15
## 21   Exterior2nd  factor    0     1    504    16
## 22     ExterQual  factor    0    14    906     4
## 23         Fence  factor 1179    11    157     4
## 24   FireplaceQu  factor  690    20    380     5
## 25    Fireplaces integer    0     0      3     0
## 26    Foundation  factor    0     3    647     6
## 27      FullBath integer    0     0      3     0
## 28    Functional  factor    0     1   1360     7
## 29    GarageArea integer    0     0   1418     0
## 30    GarageCars integer    0     0      4     0
## 31    GarageCond  factor   81     2   1326     5
## 32  GarageFinish  factor   81   352    605     3
## 33    GarageQual  factor   81     3   1311     5
## 34    GarageType  factor   81     6    870     6
## 35   GarageYrBlt integer   81  1900   2010     0
## 36     GrLivArea integer    0   334   5642     0
## 37      HalfBath integer    0     0      2     0
## 38       Heating  factor    0     1   1428     6
## 39     HeatingQC  factor    0     1    741     5
## 40    HouseStyle  factor    0     8    726     8
## 41            Id integer    0     1   1460     0
## 42  KitchenAbvGr integer    0     0      3     0
## 43   KitchenQual  factor    0    39    735     4
## 44   LandContour  factor    0    36   1311     4
## 45     LandSlope  factor    0    13   1382     3
## 46       LotArea integer    0  1300 215245     0
## 47     LotConfig  factor    0     4   1052     5
## 48   LotFrontage integer  259    21    313     0
## 49      LotShape  factor    0    10    925     4
## 50  LowQualFinSF integer    0     0    572     0
## 51    MasVnrArea integer    8     0   1600     0
## 52    MasVnrType  factor    8    15    864     4
## 53   MiscFeature  factor 1406     1     49     4
## 54       MiscVal integer    0     0  15500     0
## 55        MoSold integer    0     1     12     0
## 56    MSSubClass integer    0    20    190     0
## 57      MSZoning  factor    0    10   1151     5
## 58  Neighborhood  factor    0     2    225    25
## 59   OpenPorchSF integer    0     0    547     0
## 60   OverallCond integer    0     1      9     0
## 61   OverallQual integer    0     1     10     0
## 62    PavedDrive  factor    0    30   1340     3
## 63      PoolArea integer    0     0    738     0
## 64        PoolQC  factor 1453     2      3     3
## 65      RoofMatl  factor    0     1   1434     8
## 66     RoofStyle  factor    0     2   1141     6
## 67 SaleCondition  factor    0     4   1198     6
## 68     SalePrice integer    0 34900 755000     0
## 69      SaleType  factor    0     2   1267     9
## 70   ScreenPorch integer    0     0    480     0
## 71        Street  factor    0     6   1454     2
## 72   TotalBsmtSF integer    0     0   6110     0
## 73  TotRmsAbvGrd integer    0     2     14     0
## 74     Utilities  factor    0     1   1459     2
## 75    WoodDeckSF integer    0     0    857     0
## 76     X1stFlrSF integer    0   334   4692     0
## 77     X2ndFlrSF integer    0     0   2065     0
## 78    X3SsnPorch integer    0     0    508     0
## 79     YearBuilt integer    0  1872   2010     0
## 80  YearRemodAdd integer    0  1950   2010     0
## 81        YrSold integer    0  2006   2010     0

Let’s visualize each feature against the sale price.

library(ggplot2)

# Sort the rows by ascending sale price.
train1 <- train1[order(train1$SalePrice), ]

# Every column except the response (SalePrice) and the row identifier (Id)
# is plotted against the sale price.
variables <- names(train1)
variables <- variables[!variables %in% c("SalePrice", "Id")]

for (variable in variables) {
  if (is.factor(train1[[variable]])) {
    # Categorical feature: print the level counts (NA included when present),
    # then draw one box per level, ordered by the level's median sale price.
    print(paste(":::", variable, ":::"))
    print(table(train1[[variable]], useNA = "ifany"))
    p <- ggplot(
      train1,
      aes(
        reorder(factor(train1[[variable]]), SalePrice, FUN = median),
        SalePrice
      )
    ) +
      geom_boxplot(
        aes(fill = factor(train1[[variable]])),
        outlier.colour = "orange",
        outlier.size = 3
      ) +
      labs(x = variable, y = "Sale Price") +
      guides(fill = guide_legend(title = NULL)) +
      # White diamond marks the mean sale price of each level.
      # NOTE(review): `fun.y` is deprecated in newer ggplot2 releases in
      # favour of `fun`; kept as-is so the chunk still runs on the older
      # ggplot2 this document appears to have been rendered with -- confirm
      # the installed version before switching.
      stat_summary(
        fun.y = "mean", geom = "point", shape = 23, size = 2, fill = "white"
      )
    print(p)
  } else {
    # Numeric feature: scatter plot with a linear fit. The correlation with
    # the sale price (computed on complete cases only) is shown in the title;
    # previously the title was built but never attached to the plot.
    correlation <- cor(
      train1[[variable]], train1$SalePrice,
      use = "na.or.complete"
    )
    plot_title <- paste0(
      variable, "\nCorrelation = ", round(correlation, 3)
    )
    s <- ggplot(train1, aes(x = train1[[variable]], y = SalePrice)) +
      geom_point(shape = 1, color = "orange") +
      geom_smooth(method = lm, color = "yellow") +
      labs(title = plot_title, x = variable, y = "Sale Price")
    print(s)
  }
}
## [1] "::: Alley :::"
## 
## Grvl Pave <NA> 
##   50   41 1369

## [1] "::: BldgType :::"
## 
##   1Fam 2fmCon Duplex  Twnhs TwnhsE 
##   1220     31     52     43    114

## [1] "::: BsmtCond :::"
## 
##   Fa   Gd   Po   TA <NA> 
##   45   65    2 1311   37

## [1] "::: BsmtExposure :::"
## 
##   Av   Gd   Mn   No <NA> 
##  221  134  114  953   38

## [1] "::: BsmtFinType1 :::"
## 
##  ALQ  BLQ  GLQ  LwQ  Rec  Unf <NA> 
##  220  148  418   74  133  430   37

## [1] "::: BsmtFinType2 :::"
## 
##  ALQ  BLQ  GLQ  LwQ  Rec  Unf <NA> 
##   19   33   14   46   54 1256   38

## [1] "::: BsmtQual :::"
## 
##   Ex   Fa   Gd   TA <NA> 
##  121   35  618  649   37

## [1] "::: CentralAir :::"
## 
##    N    Y 
##   95 1365

## [1] "::: Condition1 :::"
## 
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNe   RRNn 
##     48     81   1260      8     19     11     26      2      5

## [1] "::: Condition2 :::"
## 
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNn 
##      2      6   1445      1      2      1      1      2

## [1] "::: Electrical :::"
## 
## FuseA FuseF FuseP   Mix SBrkr  <NA> 
##    94    27     3     1  1334     1

## [1] "::: ExterCond :::"
## 
##   Ex   Fa   Gd   Po   TA 
##    3   28  146    1 1282

## [1] "::: Exterior1st :::"
## 
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      20       1       2      50       1      61     222       1     220 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing 
##     108       2      25     515     206      26

## [1] "::: Exterior2nd :::"
## 
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      20       3       7      25       1      60     207      10     214 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng 
##       1     142       5      26     504     197      38

## [1] "::: ExterQual :::"
## 
##  Ex  Fa  Gd  TA 
##  52  14 488 906

## [1] "::: Fence :::"
## 
## GdPrv  GdWo MnPrv  MnWw  <NA> 
##    59    54   157    11  1179

## [1] "::: FireplaceQu :::"
## 
##   Ex   Fa   Gd   Po   TA <NA> 
##   24   33  380   20  313  690

## [1] "::: Foundation :::"
## 
## BrkTil CBlock  PConc   Slab  Stone   Wood 
##    146    634    647     24      6      3

## [1] "::: Functional :::"
## 
## Maj1 Maj2 Min1 Min2  Mod  Sev  Typ 
##   14    5   31   34   15    1 1360

## [1] "::: GarageCond :::"
## 
##   Ex   Fa   Gd   Po   TA <NA> 
##    2   35    9    7 1326   81

## [1] "::: GarageFinish :::"
## 
##  Fin  RFn  Unf <NA> 
##  352  422  605   81

## [1] "::: GarageQual :::"
## 
##   Ex   Fa   Gd   Po   TA <NA> 
##    3   48   14    3 1311   81

## [1] "::: GarageType :::"
## 
##  2Types  Attchd Basment BuiltIn CarPort  Detchd    <NA> 
##       6     870      19      88       9     387      81

## Warning: Removed 81 rows containing non-finite values (stat_smooth).
## Warning: Removed 81 rows containing missing values (geom_point).

## [1] "::: Heating :::"
## 
## Floor  GasA  GasW  Grav  OthW  Wall 
##     1  1428    18     7     2     4

## [1] "::: HeatingQC :::"
## 
##  Ex  Fa  Gd  Po  TA 
## 741  49 241   1 428

## [1] "::: HouseStyle :::"
## 
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer   SLvl 
##    154     14    726      8     11    445     37     65

## [1] "::: KitchenQual :::"
## 
##  Ex  Fa  Gd  TA 
## 100  39 586 735

## [1] "::: LandContour :::"
## 
##  Bnk  HLS  Low  Lvl 
##   63   50   36 1311

## [1] "::: LandSlope :::"
## 
##  Gtl  Mod  Sev 
## 1382   65   13

## [1] "::: LotConfig :::"
## 
##  Corner CulDSac     FR2     FR3  Inside 
##     263      94      47       4    1052

## Warning: Removed 259 rows containing non-finite values (stat_smooth).
## Warning: Removed 259 rows containing missing values (geom_point).

## [1] "::: LotShape :::"
## 
## IR1 IR2 IR3 Reg 
## 484  41  10 925

## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).

## [1] "::: MasVnrType :::"
## 
##  BrkCmn BrkFace    None   Stone    <NA> 
##      15     445     864     128       8

## [1] "::: MiscFeature :::"
## 
## Gar2 Othr Shed TenC <NA> 
##    2    2   49    1 1406

## [1] "::: MSZoning :::"
## 
## C (all)      FV      RH      RL      RM 
##      10      65      16    1151     218

## [1] "::: Neighborhood :::"
## 
## Blmngtn Blueste  BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert 
##      17       2      16      58      28     150      51     100      79 
##  IDOTRR MeadowV Mitchel   NAmes NoRidge NPkVill NridgHt  NWAmes OldTown 
##      37      17      49     225      41       9      77      73     113 
##  Sawyer SawyerW Somerst StoneBr   SWISU  Timber Veenker 
##      74      59      86      25      25      38      11

## [1] "::: PavedDrive :::"
## 
##    N    P    Y 
##   90   30 1340

## [1] "::: PoolQC :::"
## 
##   Ex   Fa   Gd <NA> 
##    2    2    3 1453

## [1] "::: RoofMatl :::"
## 
## ClyTile CompShg Membran   Metal    Roll Tar&Grv WdShake WdShngl 
##       1    1434       1       1       1      11       5       6

## [1] "::: RoofStyle :::"
## 
##    Flat   Gable Gambrel     Hip Mansard    Shed 
##      13    1141      11     286       7       2

## [1] "::: SaleCondition :::"
## 
## Abnorml AdjLand  Alloca  Family  Normal Partial 
##     101       4      12      20    1198     125

## [1] "::: SaleType :::"
## 
##   COD   Con ConLD ConLI ConLw   CWD   New   Oth    WD 
##    43     2     9     5     5     4   122     3  1267

## [1] "::: Street :::"
## 
## Grvl Pave 
##    6 1454

## [1] "::: Utilities :::"
## 
## AllPub NoSeWa 
##   1459      1