# House Price Analysis
# Load the required libraries
# Packages used by this analysis.
libraries <- c('ggplot2','dplyr','e1071','corrplot','gridExtra',
               'caret','data.table','testthat','randomForest')
# installed.packages() returns a matrix with one row per package; compare
# against its row names (the package names) rather than against every cell.
installlib <- libraries[!libraries %in% rownames(installed.packages())]
if (length(installlib) > 0) {
  install.packages(installlib, dependencies = TRUE)
}
# require() returns FALSE instead of failing, so collect the results and stop
# early when a package could not be attached.
loaded <- vapply(libraries, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
  stop("Could not load: ", paste(names(loaded)[!loaded], collapse = ", "),
       call. = FALSE)
}
loaded
## Loading required package: ggplot2
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: e1071
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: caret
## Loading required package: lattice
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## Loading required package: testthat
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
##      ggplot2        dplyr        e1071     corrplot    gridExtra        caret 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##   data.table     testthat randomForest 
##         TRUE         TRUE         TRUE
# Reading the data into R.
# The data folder is named once so the script can be pointed at another
# location by editing a single line.
data_dir <- "C:/Users/Rewati/Documents/R"
train <- fread(file.path(data_dir, "train.csv"), stringsAsFactors = FALSE)
test <- fread(file.path(data_dir, "test.csv"), stringsAsFactors = FALSE)
# View() opens a GUI data viewer; only do that when someone is at the console.
if (interactive()) {
  View(train)
  View(test)
}
# Preview the first rows of the training set.
head(train)
##    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1:  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2:  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3:  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4:  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5:  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6:  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##    Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1:    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2:    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3:    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4:    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5:    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6:    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##    HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1:     2Story           7           5      2003         2003     Gable  CompShg
## 2:     1Story           6           8      1976         1976     Gable  CompShg
## 3:     2Story           7           5      2001         2002     Gable  CompShg
## 4:     2Story           7           5      1915         1970     Gable  CompShg
## 5:     2Story           8           5      2000         2000     Gable  CompShg
## 6:     1.5Fin           5           5      1993         1995     Gable  CompShg
##    Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1:     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2:     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3:     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4:     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5:     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6:     VinylSd     VinylSd       None          0        TA        TA       Wood
##    BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1:       Gd       TA           No          GLQ        706          Unf
## 2:       Gd       TA           Gd          ALQ        978          Unf
## 3:       Gd       TA           Mn          GLQ        486          Unf
## 4:       TA       Gd           No          ALQ        216          Unf
## 5:       Gd       TA           Av          GLQ        655          Unf
## 6:       Gd       TA           No          GLQ        732          Unf
##    BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1:          0       150         856    GasA        Ex          Y      SBrkr
## 2:          0       284        1262    GasA        Ex          Y      SBrkr
## 3:          0       434         920    GasA        Ex          Y      SBrkr
## 4:          0       540         756    GasA        Gd          Y      SBrkr
## 5:          0       490        1145    GasA        Ex          Y      SBrkr
## 6:          0        64         796    GasA        Ex          Y      SBrkr
##    1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1:      856      854            0      1710            1            0        2
## 2:     1262        0            0      1262            0            1        2
## 3:      920      866            0      1786            1            0        2
## 4:      961      756            0      1717            1            0        1
## 5:     1145     1053            0      2198            1            0        2
## 6:      796      566            0      1362            1            0        1
##    HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1:        1            3            1          Gd            8        Typ
## 2:        0            3            1          TA            6        Typ
## 3:        1            3            1          Gd            6        Typ
## 4:        0            3            1          Gd            7        Typ
## 5:        1            4            1          Gd            9        Typ
## 6:        1            1            1          TA            5        Typ
##    Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1:          0        <NA>     Attchd        2003          RFn          2
## 2:          1          TA     Attchd        1976          RFn          2
## 3:          1          TA     Attchd        2001          RFn          2
## 4:          1          Gd     Detchd        1998          Unf          3
## 5:          1          TA     Attchd        2000          RFn          3
## 6:          0        <NA>     Attchd        1993          Unf          2
##    GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1:        548         TA         TA          Y          0          61
## 2:        460         TA         TA          Y        298           0
## 3:        608         TA         TA          Y          0          42
## 4:        642         TA         TA          Y          0          35
## 5:        836         TA         TA          Y        192          84
## 6:        480         TA         TA          Y         40          30
##    EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1:             0         0           0        0   <NA>  <NA>        <NA>
## 2:             0         0           0        0   <NA>  <NA>        <NA>
## 3:             0         0           0        0   <NA>  <NA>        <NA>
## 4:           272         0           0        0   <NA>  <NA>        <NA>
## 5:             0         0           0        0   <NA>  <NA>        <NA>
## 6:             0       320           0        0   <NA> MnPrv        Shed
##    MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1:       0      2   2008       WD        Normal    208500
## 2:       0      5   2007       WD        Normal    181500
## 3:       0      9   2008       WD        Normal    223500
## 4:       0      2   2006       WD       Abnorml    140000
## 5:       0     12   2008       WD        Normal    250000
## 6:     700     10   2009       WD        Normal    143000
# Preview the first rows of the test set.
head(test)
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1: 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2: 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3: 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4: 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5: 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6: 1466         60       RL          75   10000   Pave  <NA>      IR1
##    LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1:         Lvl    AllPub    Inside       Gtl        NAmes      Feedr       Norm
## 2:         Lvl    AllPub    Corner       Gtl        NAmes       Norm       Norm
## 3:         Lvl    AllPub    Inside       Gtl      Gilbert       Norm       Norm
## 4:         Lvl    AllPub    Inside       Gtl      Gilbert       Norm       Norm
## 5:         HLS    AllPub    Inside       Gtl      StoneBr       Norm       Norm
## 6:         Lvl    AllPub    Corner       Gtl      Gilbert       Norm       Norm
##    BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1:     1Fam     1Story           5           6      1961         1961     Gable
## 2:     1Fam     1Story           6           6      1958         1958       Hip
## 3:     1Fam     2Story           5           5      1997         1998     Gable
## 4:     1Fam     2Story           6           6      1998         1998     Gable
## 5:   TwnhsE     1Story           8           5      1992         1992     Gable
## 6:     1Fam     2Story           6           5      1993         1994     Gable
##    RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## 1:  CompShg     VinylSd     VinylSd       None          0        TA        TA
## 2:  CompShg     Wd Sdng     Wd Sdng    BrkFace        108        TA        TA
## 3:  CompShg     VinylSd     VinylSd       None          0        TA        TA
## 4:  CompShg     VinylSd     VinylSd    BrkFace         20        TA        TA
## 5:  CompShg     HdBoard     HdBoard       None          0        Gd        TA
## 6:  CompShg     HdBoard     HdBoard       None          0        TA        TA
##    Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1:     CBlock       TA       TA           No          Rec        468
## 2:     CBlock       TA       TA           No          ALQ        923
## 3:      PConc       Gd       TA           No          GLQ        791
## 4:      PConc       TA       TA           No          GLQ        602
## 5:      PConc       Gd       TA           No          ALQ        263
## 6:      PConc       Gd       TA           No          Unf          0
##    BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 1:          LwQ        144       270         882    GasA        TA          Y
## 2:          Unf          0       406        1329    GasA        TA          Y
## 3:          Unf          0       137         928    GasA        Gd          Y
## 4:          Unf          0       324         926    GasA        Ex          Y
## 5:          Unf          0      1017        1280    GasA        Ex          Y
## 6:          Unf          0       763         763    GasA        Gd          Y
##    Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## 1:      SBrkr      896        0            0       896            0
## 2:      SBrkr     1329        0            0      1329            0
## 3:      SBrkr      928      701            0      1629            0
## 4:      SBrkr      926      678            0      1604            0
## 5:      SBrkr     1280        0            0      1280            0
## 6:      SBrkr      763      892            0      1655            0
##    BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## 1:            0        1        0            2            1          TA
## 2:            0        1        1            3            1          Gd
## 3:            0        2        1            3            1          TA
## 4:            0        2        1            3            1          Gd
## 5:            0        2        0            2            1          Gd
## 6:            0        2        1            3            1          TA
##    TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 1:            5        Typ          0        <NA>     Attchd        1961
## 2:            6        Typ          0        <NA>     Attchd        1958
## 3:            6        Typ          1          TA     Attchd        1997
## 4:            7        Typ          1          Gd     Attchd        1998
## 5:            5        Typ          0        <NA>     Attchd        1992
## 6:            7        Typ          1          TA     Attchd        1993
##    GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1:          Unf          1        730         TA         TA          Y
## 2:          Unf          1        312         TA         TA          Y
## 3:          Fin          2        482         TA         TA          Y
## 4:          Fin          2        470         TA         TA          Y
## 5:          RFn          2        506         TA         TA          Y
## 6:          Fin          2        440         TA         TA          Y
##    WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC
## 1:        140           0             0         0         120        0   <NA>
## 2:        393          36             0         0           0        0   <NA>
## 3:        212          34             0         0           0        0   <NA>
## 4:        360          36             0         0           0        0   <NA>
## 5:          0          82             0         0         144        0   <NA>
## 6:        157          84             0         0           0        0   <NA>
##    Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 1: MnPrv        <NA>       0      6   2010       WD        Normal
## 2:  <NA>        Gar2   12500      6   2010       WD        Normal
## 3: MnPrv        <NA>       0      3   2010       WD        Normal
## 4:  <NA>        <NA>       0      6   2010       WD        Normal
## 5:  <NA>        <NA>       0      1   2010       WD        Normal
## 6:  <NA>        <NA>       0      4   2010       WD        Normal
# Split the column names of train by storage type.
# char_var: columns stored as character; numeric_var: columns stored as numeric.
char_var <- names(train)[vapply(train, is.character, logical(1))]
numeric_var <- names(train)[vapply(train, is.numeric, logical(1))]
# NOTE(review): `char_car` looks like a typo for `char_var` (it extends the
# character columns with integer-coded columns), but nothing visible in this
# script reads it. Confirm the intent before renaming: doing so would change
# which columns are later converted to factors.
char_car <- c(
  char_var,
  'BedroomAbvGr', 'HalfBath', 'KitchenAbvGr',
  'BsmtFullBath', 'BsmtHalfBath', 'MSSubClass'
)
#Data Size and Structure
# Number of rows and columns in the training set.
dim(train)
## [1] 1460   81
# Type and leading values of every column in the training set.
str(train)
## Classes 'data.table' and 'data.frame':   1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ 2ndFlrSF     : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ 3SsnPorch    : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#str(train[,c(1:10, 81)]) #Shows Response Variable
# Count the missing values in each column of the training data.
colSums(is.na(train))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical      1stFlrSF      2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch     3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
# Missing-value counts restricted to the character columns.
colSums(is.na(train[, char_var, with = FALSE]))
##      MSZoning        Street         Alley      LotShape   LandContour 
##             0             0          1369             0             0 
##     Utilities     LotConfig     LandSlope  Neighborhood    Condition1 
##             0             0             0             0             0 
##    Condition2      BldgType    HouseStyle     RoofStyle      RoofMatl 
##             0             0             0             0             0 
##   Exterior1st   Exterior2nd    MasVnrType     ExterQual     ExterCond 
##             0             0             8             0             0 
##    Foundation      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1 
##             0            37            37            38            37 
##  BsmtFinType2       Heating     HeatingQC    CentralAir    Electrical 
##            38             0             0             0             1 
##   KitchenQual    Functional   FireplaceQu    GarageType  GarageFinish 
##             0             0           690            81            81 
##    GarageQual    GarageCond    PavedDrive        PoolQC         Fence 
##            81            81             0          1453          1179 
##   MiscFeature      SaleType SaleCondition 
##          1406             0             0
# Missing-value counts restricted to the numeric columns.
colSums(is.na(train[, numeric_var, with = FALSE]))
##            Id    MSSubClass   LotFrontage       LotArea   OverallQual 
##             0             0           259             0             0 
##   OverallCond     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1 
##             0             0             0             8             0 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF      1stFlrSF      2ndFlrSF 
##             0             0             0             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces 
##             0             0             0             0             0 
##   GarageYrBlt    GarageCars    GarageArea    WoodDeckSF   OpenPorchSF 
##            81             0             0             0             0 
## EnclosedPorch     3SsnPorch   ScreenPorch      PoolArea       MiscVal 
##             0             0             0             0             0 
##        MoSold        YrSold     SalePrice 
##             0             0             0
#Visualizing for Missing Data
#' Tile plot of missing vs. present cells.
#'
#' Every cell of `data_in` becomes one tile: x is the row index, y is the
#' column name. Columns are ordered by their number of non-missing values.
#' Cells are coded 0 when missing and 1 when present; 0 is drawn green and
#' 1 red, as stated in the legend title.
#'
#' @param data_in A data.frame-like table (data.frame or data.table).
#' @param title Optional plot title passed to `ggtitle()`.
#' @return The ggplot object.
plot_Missing <- function(data_in, title = NULL) {
  # 1 = value present, 0 = value missing.
  present <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  # drop = FALSE keeps a data frame even when there is only one column;
  # without it the single column collapses to a vector and nrow() is NULL.
  present <- present[, order(colSums(present)), drop = FALSE]
  # seq_len() is safe for zero rows, unlike 1:nrow().
  tiles <- expand.grid(list(x = seq_len(nrow(present)), y = colnames(present)))
  # x varies fastest in expand.grid, which matches the column-major order of
  # the flattened matrix.
  tiles$m <- as.vector(as.matrix(present))
  ggplot(tiles) +
    geom_tile(aes(x = x, y = y, fill = factor(m))) +
    # Named values pin each colour to its code, so the mapping does not shift
    # when only one of the two codes occurs in the data.
    scale_fill_manual(values = c("0" = "green", "1" = "red"),
                      name = "Missing\n(0=Yes, 1=No)") +
    theme_light() +
    ylab("") +
    xlab("") +
    ggtitle(title)
}

# Draw the missingness map for only those columns of train that contain at
# least one NA.
plot_Missing(train[,colSums(is.na(train)) > 0, with = FALSE])

# Houses whose remodel year differs from the build year are counted as
# remodeled.
n_remodeled <- sum(train$YearRemodAdd != train$YearBuilt)
n_remodeled
## [1] 696
# Scale by 100 so the printed number really is a percentage, as the message
# says (previously the raw proportion was printed).
cat('Percentage of houses remodeled', 100 * n_remodeled / nrow(train))
## Percentage of houses remodeled 47.67123
# Bar chart of the number of houses with and without a remodel
# (Remodeled is 1 when YearBuilt and YearRemodAdd differ, else 0).
train %>%
  select(YearBuilt, YearRemodAdd) %>%
  mutate(Remodeled = as.integer(YearBuilt != YearRemodAdd)) %>%
  ggplot(aes(x = factor(x = Remodeled, labels = c('No', 'Yes')))) +
  geom_bar() +
  xlab('Remodeled') +
  theme_light()

# Summary statistics for every numeric column of the training data.
summary(train[, numeric_var, with = FALSE])
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF        1stFlrSF       2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      3SsnPorch     
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
# Report the size of the training table.
cat('Train has', nrow(train), 'rows and', ncol(train), 'columns.')
## Train has 1460 rows and 81 columns.
# Report the size of the test table.
cat('Test has', nrow(test), 'rows and', ncol(test), 'columns.')
## Test has 1459 rows and 80 columns.
# Share of cells that are missing in train (a proportion between 0 and 1).
mean(is.na(train))
## [1] 0.05889565
# Share of cells that are missing in test (a proportion between 0 and 1).
mean(is.na(test))
## [1] 0.05997258
#Check for duplicated rows
cat("The number of duplicated rows are", nrow(train) - nrow(unique(train)))
## The number of duplicated rows are 0
#Convert Character to factors
train[,(char_var) := lapply(.SD, as.factor), .SDcols = char_var]
train_cat <- train[,.SD, .SDcols = char_var]
train_cont <- train[,.SD,.SDcols = numeric_var]
# Build a bar chart of value counts for one column.
#   data_in: data set holding the column
#   i:       position of the column to plot
# Returns the ggplot object; the column is treated as a factor, the x axis is
# titled with the column name and its tick labels are rotated by 90 degrees.
plotHist <- function(data_in, i) {
  column_name <- colnames(data_in)[i]
  values <- data.frame(x = data_in[[i]])
  rotated_ticks <- theme(axis.text.x = element_text(angle = 90, hjust = 1))
  ggplot(data = values, aes(x = factor(x))) +
    stat_count() +
    xlab(column_name) +
    theme_light() +
    rotated_ticks
}

# Draw several single-column plots in one grid.
#   data_in: data set passed through to `fun`
#   fun:     plotting function called as fun(data_in = ..., i = ...)
#   ii:      column positions to plot, one panel each
#   ncol:    number of grid columns handed to grid.arrange (default 3)
# Returns whatever grid.arrange returns.
doPlots <- function(data_in, fun, ii, ncol = 3) {
  # One panel per requested index; unname() so the panels are always passed
  # to grid.arrange positionally.
  panels <- unname(lapply(ii, function(idx) fun(data_in = data_in, i = idx)))
  layout_args <- c(panels, ncol = ncol)
  do.call("grid.arrange", layout_args)
}

# Build a density curve for one column.
#   data_in: data set holding the column (its SalePrice column is also read)
#   i:       position of the column to plot
# Returns the ggplot object; the x-axis title is the column name followed by
# the column's skewness (missing values removed, rounded to 2 decimals).
plotDen <- function(data_in, i) {
  values <- data_in[[i]]
  skew <- round(skewness(values, na.rm = TRUE), 2)
  axis_title <- paste0(colnames(data_in)[i], '\n', 'Skewness: ', skew)
  density_data <- data.frame(x = values, SalePrice = data_in$SalePrice)
  ggplot(data = density_data) +
    geom_line(aes(x = x), stat = 'density', size = 1, alpha = 1.0) +
    xlab(axis_title) +
    theme_light()
}
# Bar plots for the categorical features: one two-column grid per index range.
# NOTE(review): consecutive ranges share an endpoint (columns 4, 8 and 18 are
# drawn twice) -- confirm whether the overlap is intended.
invisible(lapply(
  list(1:4, 4:8, 8:12, 13:18, 18:22),
  function(cols) doPlots(train_cat, fun = plotHist, ii = cols, ncol = 2)
))

# Count the houses with a 'Sev' or 'Mod' LandSlope in each neighborhood and
# draw the counts as dodged bars, one bar colour per LandSlope level.
# `%in%` is required for the set-membership test: `LandSlope == c('Sev', 'Mod')`
# recycles the two values down the rows (row 1 vs 'Sev', row 2 vs 'Mod', ...)
# and therefore silently drops matching rows.
train %>%
  filter(LandSlope %in% c('Sev', 'Mod')) %>%
  group_by(Neighborhood, LandSlope) %>%
  summarize(Count = n()) %>%
  ungroup() %>%
  ggplot(aes(Neighborhood, Count)) +
  geom_bar(aes(fill = LandSlope), position = 'dodge', stat = 'identity') +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Box plot of SalePrice for each neighborhood, with rotated x tick labels.
ggplot(
  select(train, Neighborhood, SalePrice),
  aes(factor(Neighborhood), SalePrice)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab('Neighborhoods')

# Density plots for numeric variables: columns 2-17 of train_cont, drawn with
# plotDen in three two-column grids. The `## Warning` lines are the recorded
# output of the call directly above them.
doPlots(train_cont, fun = plotDen, ii = 2:6, ncol = 2)
## Warning: Removed 259 rows containing non-finite values (stat_density).

doPlots(train_cont, fun = plotDen, ii = 7:12, ncol = 2)
## Warning: Removed 8 rows containing non-finite values (stat_density).

doPlots(train_cont, fun = plotDen, ii = 13:17, ncol = 2)

# Columns 18-23 are drawn with plotHist (per-value count bars) instead of plotDen.
doPlots(train_cont, fun = plotHist, ii = 18:23, ncol = 2)

# Explore the correlation.
# Pairwise correlations of the train_cont columns, leaving out the first
# column (presumably the Id column -- verify) and every row with a missing value.
correlations <- cor(na.omit(train_cont[, -1, with = FALSE]))

# Keep only variables whose absolute correlation exceeds 0.3 with more than
# one variable; each variable's correlation with itself already counts once.
row_indic <- rowSums(abs(correlations) > 0.3) > 1

correlations <- correlations[row_indic, row_indic]
corrplot(correlations, method = "square")

# Plots for variables that have high correlation.
# Box plot of YearBuilt for each OverallCond level.
ggplot(
  select(train, OverallCond, YearBuilt),
  aes(factor(OverallCond), YearBuilt)
) +
  geom_boxplot() +
  xlab('Overall Condition')

# Scatter plot of SalePrice against one column, with a fitted linear trend.
#   data_in: data set holding the column; must also contain SalePrice
#   i:       position of the column to plot on the x axis
# Returns the ggplot object. The x-axis title is the column name followed by
# the correlation between the column and SalePrice (complete observations
# only, rounded to 2 decimals).
plotCorr <- function(data_in, i) {
  data <- data.frame(x = data_in[[i]], SalePrice = data_in$SalePrice)
  # cor() yields the signed correlation coefficient, not R-squared, so the
  # label names it as such.
  correlation <- round(cor(data$x, data$SalePrice, use = 'complete.obs'), 2)
  axis_title <- paste0(colnames(data_in)[i], '\n', 'Correlation: ', correlation)
  # No suppressWarnings() around the result: ggplot2 raises its warnings when
  # the plot is drawn, not when the object is returned, so it had no effect.
  ggplot(data, aes(x = x, y = SalePrice)) +
    geom_point(shape = 1, na.rm = TRUE) +
    geom_smooth(method = lm) +
    xlab(axis_title) +
    theme_light()
}

# Variables correlated with SalePrice above 0.5, followed by those below -0.2.
highcorr <- c(
  rownames(correlations)[which(correlations[, 'SalePrice'] > 0.5)],
  rownames(correlations)[which(correlations[, 'SalePrice'] < -0.2)]
)

# Subset train to the selected columns.
data_corr <- train[, .SD, .SDcols = highcorr]

# Scatter plots against SalePrice for columns 1-6 of the selection.
doPlots(data_corr, fun = plotCorr, ii = 1:6)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Scatter plots for columns 6-11 (column 6 appears in both grids).
doPlots(data_corr, fun = plotCorr, ii = 6:11)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 81 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

library(scales)
# Histogram of SalePrice with comma-formatted x-axis labels.
ggplot(data = train, mapping = aes(x = SalePrice)) +
  geom_histogram(colour = 'white') +
  theme_light() +
  scale_x_continuous(labels = comma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Summary statistics of SalePrice.
summary(train[, 'SalePrice', with = FALSE])
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
# Normalize the distribution: histogram of log(SalePrice + 1).
ggplot(data = train, mapping = aes(x = log(SalePrice + 1))) +
  geom_histogram(colour = 'white') +
  theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.