R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the training data.
# Strings are converted to factors explicitly: since R 4.0.0 read.csv()
# defaults to stringsAsFactors = FALSE, while the is.factor() test and the
# factor plots further down this script need factor columns.
train_data <- read.csv(
  "G:/kaggle/houseprice/data/train.csv",
  stringsAsFactors = TRUE
)
# Get a first impression of the data: print the first 10 rows.
head(train_data, n = 10)
##    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1   1         60       RL          65    8450   Pave  <NA>      Reg
## 2   2         20       RL          80    9600   Pave  <NA>      Reg
## 3   3         60       RL          68   11250   Pave  <NA>      IR1
## 4   4         70       RL          60    9550   Pave  <NA>      IR1
## 5   5         60       RL          84   14260   Pave  <NA>      IR1
## 6   6         50       RL          85   14115   Pave  <NA>      IR1
## 7   7         20       RL          75   10084   Pave  <NA>      Reg
## 8   8         60       RL          NA   10382   Pave  <NA>      IR1
## 9   9         50       RM          51    6120   Pave  <NA>      Reg
## 10 10        190       RL          50    7420   Pave  <NA>      Reg
##    LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1          Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2          Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3          Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4          Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5          Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6          Lvl    AllPub    Inside       Gtl      Mitchel       Norm
## 7          Lvl    AllPub    Inside       Gtl      Somerst       Norm
## 8          Lvl    AllPub    Corner       Gtl       NWAmes       PosN
## 9          Lvl    AllPub    Inside       Gtl      OldTown     Artery
## 10         Lvl    AllPub    Corner       Gtl      BrkSide     Artery
##    Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1        Norm     1Fam     2Story           7           5      2003
## 2        Norm     1Fam     1Story           6           8      1976
## 3        Norm     1Fam     2Story           7           5      2001
## 4        Norm     1Fam     2Story           7           5      1915
## 5        Norm     1Fam     2Story           8           5      2000
## 6        Norm     1Fam     1.5Fin           5           5      1993
## 7        Norm     1Fam     1Story           8           5      2004
## 8        Norm     1Fam     2Story           7           6      1973
## 9        Norm     1Fam     1.5Fin           7           5      1931
## 10     Artery   2fmCon     1.5Unf           5           6      1939
##    YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1          2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2          1976     Gable  CompShg     MetalSd     MetalSd       None
## 3          2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4          1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5          2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6          1995     Gable  CompShg     VinylSd     VinylSd       None
## 7          2005     Gable  CompShg     VinylSd     VinylSd      Stone
## 8          1973     Gable  CompShg     HdBoard     HdBoard      Stone
## 9          1950     Gable  CompShg     BrkFace     Wd Shng       None
## 10         1950     Gable  CompShg     MetalSd     MetalSd       None
##    MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 1         196        Gd        TA      PConc       Gd       TA
## 2           0        TA        TA     CBlock       Gd       TA
## 3         162        Gd        TA      PConc       Gd       TA
## 4           0        TA        TA     BrkTil       TA       Gd
## 5         350        Gd        TA      PConc       Gd       TA
## 6           0        TA        TA       Wood       Gd       TA
## 7         186        Gd        TA      PConc       Ex       TA
## 8         240        TA        TA     CBlock       Gd       TA
## 9           0        TA        TA     BrkTil       TA       TA
## 10          0        TA        TA     BrkTil       TA       TA
##    BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 1            No          GLQ        706          Unf          0       150
## 2            Gd          ALQ        978          Unf          0       284
## 3            Mn          GLQ        486          Unf          0       434
## 4            No          ALQ        216          Unf          0       540
## 5            Av          GLQ        655          Unf          0       490
## 6            No          GLQ        732          Unf          0        64
## 7            Av          GLQ       1369          Unf          0       317
## 8            Mn          ALQ        859          BLQ         32       216
## 9            No          Unf          0          Unf          0       952
## 10           No          GLQ        851          Unf          0       140
##    TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 1          856    GasA        Ex          Y      SBrkr       856       854
## 2         1262    GasA        Ex          Y      SBrkr      1262         0
## 3          920    GasA        Ex          Y      SBrkr       920       866
## 4          756    GasA        Gd          Y      SBrkr       961       756
## 5         1145    GasA        Ex          Y      SBrkr      1145      1053
## 6          796    GasA        Ex          Y      SBrkr       796       566
## 7         1686    GasA        Ex          Y      SBrkr      1694         0
## 8         1107    GasA        Ex          Y      SBrkr      1107       983
## 9          952    GasA        Gd          Y      FuseF      1022       752
## 10         991    GasA        Ex          Y      SBrkr      1077         0
##    LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1             0      1710            1            0        2        1
## 2             0      1262            0            1        2        0
## 3             0      1786            1            0        2        1
## 4             0      1717            1            0        1        0
## 5             0      2198            1            0        2        1
## 6             0      1362            1            0        1        1
## 7             0      1694            1            0        2        0
## 8             0      2090            1            0        2        1
## 9             0      1774            0            0        2        0
## 10            0      1077            1            0        1        0
##    BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1             3            1          Gd            8        Typ
## 2             3            1          TA            6        Typ
## 3             3            1          Gd            6        Typ
## 4             3            1          Gd            7        Typ
## 5             4            1          Gd            9        Typ
## 6             1            1          TA            5        Typ
## 7             3            1          Gd            7        Typ
## 8             3            1          TA            7        Typ
## 9             2            2          TA            8       Min1
## 10            2            2          TA            5        Typ
##    Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1           0        <NA>     Attchd        2003          RFn          2
## 2           1          TA     Attchd        1976          RFn          2
## 3           1          TA     Attchd        2001          RFn          2
## 4           1          Gd     Detchd        1998          Unf          3
## 5           1          TA     Attchd        2000          RFn          3
## 6           0        <NA>     Attchd        1993          Unf          2
## 7           1          Gd     Attchd        2004          RFn          2
## 8           2          TA     Attchd        1973          RFn          2
## 9           2          TA     Detchd        1931          Unf          2
## 10          2          TA     Attchd        1939          RFn          1
##    GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1         548         TA         TA          Y          0          61
## 2         460         TA         TA          Y        298           0
## 3         608         TA         TA          Y          0          42
## 4         642         TA         TA          Y          0          35
## 5         836         TA         TA          Y        192          84
## 6         480         TA         TA          Y         40          30
## 7         636         TA         TA          Y        255          57
## 8         484         TA         TA          Y        235         204
## 9         468         Fa         TA          Y         90           0
## 10        205         Gd         TA          Y          0           4
##    EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1              0          0           0        0   <NA>  <NA>        <NA>
## 2              0          0           0        0   <NA>  <NA>        <NA>
## 3              0          0           0        0   <NA>  <NA>        <NA>
## 4            272          0           0        0   <NA>  <NA>        <NA>
## 5              0          0           0        0   <NA>  <NA>        <NA>
## 6              0        320           0        0   <NA> MnPrv        Shed
## 7              0          0           0        0   <NA>  <NA>        <NA>
## 8            228          0           0        0   <NA>  <NA>        Shed
## 9            205          0           0        0   <NA>  <NA>        <NA>
## 10             0          0           0        0   <NA>  <NA>        <NA>
##    MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1        0      2   2008       WD        Normal    208500
## 2        0      5   2007       WD        Normal    181500
## 3        0      9   2008       WD        Normal    223500
## 4        0      2   2006       WD       Abnorml    140000
## 5        0     12   2008       WD        Normal    250000
## 6      700     10   2009       WD        Normal    143000
## 7        0      8   2007       WD        Normal    307000
## 8      350     11   2009       WD        Normal    200000
## 9        0      4   2008       WD       Abnorml    129900
## 10       0      1   2008       WD        Normal    118000
# Show the structure of the data frame: the type of every column together
# with its first few values.
str(train_data)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Print a per-column summary: quantiles and mean for numeric columns, level
# counts for factor columns, and the number of NA values where present.
summary(train_data)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   None   :864   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   Stone  :128   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   NA's   :  8   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation  BsmtQual   BsmtCond    BsmtExposure
##  Ex: 52    Ex:   3   BrkTil:146   Ex  :121   Fa  :  45   Av  :221    
##  Fa: 14    Fa:  28   CBlock:634   Fa  : 35   Gd  :  65   Gd  :134    
##  Gd:488    Gd: 146   PConc :647   Gd  :618   Po  :   2   Mn  :114    
##  TA:906    Po:   1   Slab  : 24   TA  :649   TA  :1311   No  :953    
##            TA:1282   Stone :  6   NA's: 37   NA's:  37   NA's: 38    
##                      Wood  :  3                                      
##                                                                      
##  BsmtFinType1   BsmtFinSF1     BsmtFinType2   BsmtFinSF2     
##  ALQ :220     Min.   :   0.0   ALQ :  19    Min.   :   0.00  
##  BLQ :148     1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00  
##  GLQ :418     Median : 383.5   GLQ :  14    Median :   0.00  
##  LwQ : 74     Mean   : 443.6   LwQ :  46    Mean   :  46.55  
##  Rec :133     3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00  
##  Unf :430     Max.   :5644.0   Unf :1256    Max.   :1474.00  
##  NA's: 37                      NA's:  38                     
##    BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC CentralAir
##  Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741    N:  95    
##  1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365    
##  Median : 477.5   Median : 991.5   GasW :  18   Gd:241              
##  Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1              
##  3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428              
##  Max.   :2336.0   Max.   :6110.0   Wall :   4                       
##                                                                     
##  Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##  Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                              
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100     
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39     
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586     
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000              
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000              
##                                                              
##   TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu   GarageType 
##  Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6  
##  1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870  
##  Median : 6.000   Min1:  31   Median :1.000   Gd  :380    Basment: 19  
##  Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88  
##  3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9  
##  Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690    Detchd :387  
##                   Typ :1360                               NA's   : 81  
##   GarageYrBlt   GarageFinish   GarageCars      GarageArea     GarageQual 
##  Min.   :1900   Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3  
##  1st Qu.:1961   RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48  
##  Median :1980   Unf :605     Median :2.000   Median : 480.0   Gd  :  14  
##  Mean   :1979   NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3  
##  3rd Qu.:2002                3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311  
##  Max.   :2010                Max.   :4.000   Max.   :1418.0   NA's:  81  
##  NA's   :81                                                              
##  GarageCond  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex  :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa  :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd  :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  Po  :   7              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  TA  :1326              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  NA's:  81              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                           
##    X3SsnPorch      ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex  :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd  :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   NA's:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.00   Max.   :480.00   Max.   :738.000              
##                                                                 
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv:  59   Gar2:   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo :  54   Othr:   2   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv: 157   Shed:  49   Median :    0.00   Median : 6.000  
##  MnWw :  11   TenC:   1   Mean   :   43.49   Mean   : 6.322  
##  NA's :1179   NA's:1406   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :15500.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
# Count the NA values in every column and list the most incomplete columns
# first. is.na() on a data frame already returns a logical matrix, so no
# sapply() is needed (sapply() would return a list for a zero-row frame).
sort(colSums(is.na(train_data)), decreasing = TRUE)
##        PoolQC   MiscFeature         Alley         Fence   FireplaceQu 
##          1453          1406          1369          1179           690 
##   LotFrontage    GarageType   GarageYrBlt  GarageFinish    GarageQual 
##           259            81            81            81            81 
##    GarageCond  BsmtExposure  BsmtFinType2      BsmtQual      BsmtCond 
##            81            38            38            37            37 
##  BsmtFinType1    MasVnrType    MasVnrArea    Electrical            Id 
##            37             8             8             1             0 
##    MSSubClass      MSZoning       LotArea        Street      LotShape 
##             0             0             0             0             0 
##   LandContour     Utilities     LotConfig     LandSlope  Neighborhood 
##             0             0             0             0             0 
##    Condition1    Condition2      BldgType    HouseStyle   OverallQual 
##             0             0             0             0             0 
##   OverallCond     YearBuilt  YearRemodAdd     RoofStyle      RoofMatl 
##             0             0             0             0             0 
##   Exterior1st   Exterior2nd     ExterQual     ExterCond    Foundation 
##             0             0             0             0             0 
##    BsmtFinSF1    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##             0             0             0             0             0 
##     HeatingQC    CentralAir     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##             0             0             0             0             0 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##             0             0             0             0             0 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##             0             0             0             0             0 
##    Fireplaces    GarageCars    GarageArea    PavedDrive    WoodDeckSF 
##             0             0             0             0             0 
##   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea 
##             0             0             0             0             0 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
# deal with NA elements
# (1) actually missing (2) add new level
# (3) imputation (4) remove all rows with NA
# Classify every column of train_data as one of:
#   "cat"     - categorical: a factor or character column,
#   "num_cat" - numeric with fewer than 20 distinct values; we assume such
#               a column is categorical in nature as well,
#   "num"     - any other (numeric) column.
# Character columns are treated like factors so that the result does not
# depend on whether read.csv() converted strings to factors.
# The result is an unnamed character vector with one entry per column, in
# column order.
cat_num <- vapply(
  train_data,
  function(col) {
    if (is.factor(col) || is.character(col)) {
      "cat"
    } else if (length(unique(col)) < 20) {
      # unique() keeps NA, so NA counts as one distinct value here
      "num_cat"
    } else {
      "num"
    }
  },
  character(1),
  USE.NAMES = FALSE
)

# write.table(fea_list,
          # file = "G:/kaggle/houseprice/House_Price_Feature_List_to_be_filled.csv")
# train_data_num <- train_data[sapply(train_data, is.numeric)]
# keep record of type of each  variable
library(xlsx) #load the package
## Loading required package: rJava
## Loading required package: xlsxjars
# Load the feature-list workbook, store the inferred column types (cat_num)
# in its `type` column, and export the result as CSV.
# NOTE(review): assumes Sheet1 has one row per column of train_data, in the
# same order - confirm against the workbook.
fea_list <- read.xlsx(
  sheetName = "Sheet1",
  file = "G:/kaggle/houseprice/House_Price_Feature_List_to_be_filled.xlsx"
)
# head(fea_list)
fea_list[["type"]] <- cat_num
write.csv(
  x = fea_list,
  file = "G:/kaggle/houseprice/House_Price_Feature_List_to_be_filled.csv"
)
# Sanity check: list the numeric columns that were flagged as categorical
# ("num_cat") so the classification can be verified by eye.
names(train_data)[cat_num == "num_cat"]
##  [1] "MSSubClass"   "OverallQual"  "OverallCond"  "BsmtFullBath"
##  [5] "BsmtHalfBath" "FullBath"     "HalfBath"     "BedroomAbvGr"
##  [9] "KitchenAbvGr" "TotRmsAbvGrd" "Fireplaces"   "GarageCars"  
## [13] "PoolArea"     "MoSold"       "YrSold"
# head(train_data$BsmtFinSF2,n = 20)
# Collect the column names of each feature type.
# Id is dropped from the numeric features by name rather than with a
# positional slice, so the selection keeps working if columns are added,
# removed or reordered.
num_fea_names <- setdiff(colnames(train_data)[cat_num == "num"], "Id")
num_cat_names <- colnames(train_data)[cat_num == "num_cat"]
cat_fea_names <- colnames(train_data)[cat_num == "cat"]
# Get more detail on the distribution of the features:
#   categorical features: table(), plot(table())
#   numerical features:   summary statistics, boxplot(), plot(density())
# Simple analysis of the correlation between the numeric features and the
# prediction target (SalePrice).
library(corrplot)
correlation <- cor(train_data[, num_fea_names], use = "pairwise.complete.obs")

# head(correlation)
# Select the features whose correlation with SalePrice is above 0.5 in
# absolute value. The SalePrice column is looked up by name, so the filter
# does not rely on it being the last column of the matrix.
rowId <- abs(correlation[, "SalePrice"]) > 0.5
# An NA correlation means "not selected"; an NA in a logical index would
# otherwise produce NA rows and columns.
rowId[is.na(rowId)] <- FALSE
correlation[rowId, rowId]
##              YearBuilt YearRemodAdd TotalBsmtSF X1stFlrSF GrLivArea
## YearBuilt    1.0000000    0.5928550   0.3914520 0.2819859 0.1990097
## YearRemodAdd 0.5928550    1.0000000   0.2910656 0.2403793 0.2873885
## TotalBsmtSF  0.3914520    0.2910656   1.0000000 0.8195300 0.4548682
## X1stFlrSF    0.2819859    0.2403793   0.8195300 1.0000000 0.5660240
## GrLivArea    0.1990097    0.2873885   0.4548682 0.5660240 1.0000000
## GarageArea   0.4789538    0.3715998   0.4866655 0.4897817 0.4689975
## SalePrice    0.5228973    0.5071010   0.6135806 0.6058522 0.7086245
##              GarageArea SalePrice
## YearBuilt     0.4789538 0.5228973
## YearRemodAdd  0.3715998 0.5071010
## TotalBsmtSF   0.4866655 0.6135806
## X1stFlrSF     0.4897817 0.6058522
## GrLivArea     0.4689975 0.7086245
## GarageArea    1.0000000 0.6234314
## SalePrice     0.6234314 1.0000000
# Plot the filtered correlation matrix with corrplot's "square" method.
corrplot(correlation[rowId, rowId], method = "square")

# Simple analysis of the relation between the numeric-but-categorical
# features and the prediction target: for each feature, compare the
# SalePrice distribution of the rows at or above the feature's mean with
# that of the rows below it.
# The loop variable must not be called cat_num: that name holds the vector
# of column types built earlier and would be overwritten by the loop.
for (fea_name in num_cat_names) {
  fea_values <- train_data[[fea_name]]
  fea_mean <- mean(fea_values, na.rm = TRUE)
  # rows with an NA feature value belong to neither group
  is_high <- !is.na(fea_values) & fea_values >= fea_mean
  is_low <- !is.na(fea_values) & fea_values < fea_mean
  boxplot(
    train_data$SalePrice[is_high],
    train_data$SalePrice[is_low],
    names = c(">= mean", "< mean"),
    xlab = fea_name, ylab = "SalePrice"
  )
}

# Simple analysis of the relation between each categorical feature and the
# prediction target: one plot of SalePrice against every such feature.
# library(tabplot)
# library(lattice)
# bwplot(Neighborhood ~ SalePrice, data = train_data)
for (name in cat_fea_names) {
  plot(
    train_data[[name]],
    train_data[["SalePrice"]],
    xlab = name,
    ylab = "SalePrice"
  )
}

# feature engineering
# Each candidate feature is derived from existing columns and its
# correlation with SalePrice is printed right away.

# Reference year used by all age-like features. It is kept in one named
# constant instead of being repeated as a literal in every formula; 2017 is
# the value this script has always used.
ref_year <- 2017

# age of the house in years
Age <- ref_year - train_data$YearBuilt
cor(Age, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] -0.5228973
# total floor area in square feet (first floor + second floor)
tot_Flo_area <- train_data$X1stFlrSF + train_data$X2ndFlrSF
cor(tot_Flo_area, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] 0.7168831
# total number of bathrooms; every half bath counts as 0.5
totalBath <- with(
  train_data,
  BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath
)
cor(totalBath, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] 0.6317311
# age of the garage in years (NA where GarageYrBlt is NA)
Age_gar <- ref_year - train_data$GarageYrBlt
cor(Age_gar, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] -0.4863617
# total number of rooms: TotRmsAbvGrd + BedroomAbvGr
# NOTE(review): if TotRmsAbvGrd already includes the bedrooms, this counts
# them twice - confirm against the data description.
tot_room <- train_data$TotRmsAbvGrd + train_data$BedroomAbvGr
cor(tot_room, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] 0.4448281
# ratio of GrLivArea to the total floor area computed above
# (a plain ratio, not a percentage: it is not multiplied by 100)
per_liv <- train_data$GrLivArea / tot_Flo_area
cor(per_liv, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] -0.0626086
# years elapsed since the house was last remodeled (YearRemodAdd)
rep_yea <- ref_year - train_data$YearRemodAdd
cor(rep_yea, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] -0.507101
# years between construction (YearBuilt) and remodeling (YearRemodAdd)
bef_rep_yea <- train_data$YearRemodAdd - train_data$YearBuilt
cor(bef_rep_yea, train_data$SalePrice, use = "pairwise.complete.obs")
## [1] -0.2175033