数据读取

# Load the training and test CSV files.
# stringsAsFactors = FALSE is spelled out so text columns are read as
# character on every R version (the default was TRUE before R 4.0). The
# cleaning step further down only converts columns that are still character.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)

# Quick look at the dimensions and the column types.
dim(train)
## [1] 1460   81
dim(test)
## [1] 1459   80
str(train)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

数据清洗

# Compare the distinct MiscFeature values present in each data set.
unique(train[["MiscFeature"]])
## [1] NA     "Shed" "Gar2" "Othr" "TenC"
unique(test[["MiscFeature"]])
## [1] NA     "Gar2" "Shed" "Othr"

从上面的结果可以看出,训练集和测试集中某些特征所包含的取值并不完全相同。针对这一情况,在定义因子变量时,需要先将训练集和测试集合并,再把相同的特征一起转换为因子变量,以保证两者的因子水平一致。

定义字符串为因子变量

# Set the target aside so train and test have identical columns and can be
# stacked.
SalePrice <- train$SalePrice
train$SalePrice <- NULL

# Remember the split point before the two sets are combined.
n_train <- nrow(train)

# Stack train on top of test so each factor is built from the union of the
# values seen in both sets and therefore gets the same levels in both.
full_data <- rbind(train, test)

# Convert every character column to a factor, recoding NA as the explicit
# level "missing".
for (col in colnames(full_data)) {
  if (is.character(full_data[[col]])) {
    new_col <- full_data[[col]]
    new_col[is.na(new_col)] <- "missing"
    full_data[[col]] <- as.factor(new_col)
  }
}

# Split back into train and test by row position. seq_len() keeps both index
# vectors valid even if one of the sets has zero rows, where 1:0 or (n + 1):n
# would count backwards and select the wrong rows.
train <- full_data[seq_len(n_train), ]
train$SalePrice <- SalePrice
test <- full_data[n_train + seq_len(nrow(full_data) - n_train), ]

summary(train)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   missing:   0   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RH     :  16   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RL     :1151   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0   RM     : 218   Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street         Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl   :  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   missing:1369   IR2: 41   HLS:  50   
##  Median :  9478               Pave   :  41   IR3: 10   Low:  36   
##  Mean   : 10517                              Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                   
##  Max.   :215245                                                   
##                                                                   
##    Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub :1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  missing:   0   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##  NoSeWa :   1   FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                 FR3    :   4              Edwards:100   RRAn   :  26  
##                 Inside :1052              Somerst: 86   PosN   :  19  
##                                           Gilbert: 79   RRAe   :  11  
##                                           (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   missing:  8   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   None   :864   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   Stone  :128   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation     BsmtQual      BsmtCond   
##  Ex: 52    Ex:   3   BrkTil:146   Ex     :121   Fa     :  45  
##  Fa: 14    Fa:  28   CBlock:634   Fa     : 35   Gd     :  65  
##  Gd:488    Gd: 146   PConc :647   Gd     :618   missing:  37  
##  TA:906    Po:   1   Slab  : 24   missing: 37   Po     :   2  
##            TA:1282   Stone :  6   TA     :649   TA     :1311  
##                      Wood  :  3                               
##                                                               
##   BsmtExposure  BsmtFinType1   BsmtFinSF1      BsmtFinType2 
##  Av     :221   ALQ    :220   Min.   :   0.0   ALQ    :  19  
##  Gd     :134   BLQ    :148   1st Qu.:   0.0   BLQ    :  33  
##  missing: 38   GLQ    :418   Median : 383.5   GLQ    :  14  
##  Mn     :114   LwQ    : 74   Mean   : 443.6   LwQ    :  46  
##  No     :953   missing: 37   3rd Qu.: 712.2   missing:  38  
##                Rec    :133   Max.   :5644.0   Rec    :  54  
##                Unf    :430                    Unf    :1256  
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF      Heating    
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Floor:   1  
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428  
##  Median :   0.00   Median : 477.5   Median : 991.5   GasW :  18  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4   Grav :   7  
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2  
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0   Wall :   4  
##                                                                  
##  HeatingQC CentralAir   Electrical     X1stFlrSF      X2ndFlrSF   
##  Ex:741    N:  95     FuseA  :  94   Min.   : 334   Min.   :   0  
##  Fa: 49    Y:1365     FuseF  :  27   1st Qu.: 882   1st Qu.:   0  
##  Gd:241               FuseP  :   3   Median :1087   Median :   0  
##  Po:  1               missing:   1   Mean   :1163   Mean   : 347  
##  TA:428               Mix    :   1   3rd Qu.:1391   3rd Qu.: 728  
##                       SBrkr  :1334   Max.   :4692   Max.   :2065  
##                                                                   
##   LowQualFinSF       GrLivArea     BsmtFullBath     BsmtHalfBath    
##  Min.   :  0.000   Min.   : 334   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.000   Median :1464   Median :0.0000   Median :0.00000  
##  Mean   :  5.845   Mean   :1515   Mean   :0.4253   Mean   :0.05753  
##  3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :572.000   Max.   :5642   Max.   :3.0000   Max.   :2.00000  
##                                                                     
##     FullBath        HalfBath       BedroomAbvGr    KitchenAbvGr  
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :2.000   Median :0.0000   Median :3.000   Median :1.000  
##  Mean   :1.565   Mean   :0.3829   Mean   :2.866   Mean   :1.047  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000  
##  Max.   :3.000   Max.   :2.0000   Max.   :8.000   Max.   :3.000  
##                                                                  
##   KitchenQual   TotRmsAbvGrd      Functional     Fireplaces   
##  Ex     :100   Min.   : 2.000   Typ    :1360   Min.   :0.000  
##  Fa     : 39   1st Qu.: 5.000   Min2   :  34   1st Qu.:0.000  
##  Gd     :586   Median : 6.000   Min1   :  31   Median :1.000  
##  missing:  0   Mean   : 6.518   Mod    :  15   Mean   :0.613  
##  TA     :735   3rd Qu.: 7.000   Maj1   :  14   3rd Qu.:1.000  
##                Max.   :14.000   Maj2   :   5   Max.   :3.000  
##                                 (Other):   1                  
##   FireplaceQu    GarageType   GarageYrBlt    GarageFinish   GarageCars   
##  Ex     : 24   2Types :  6   Min.   :1900   Fin    :352   Min.   :0.000  
##  Fa     : 33   Attchd :870   1st Qu.:1961   missing: 81   1st Qu.:1.000  
##  Gd     :380   Basment: 19   Median :1980   RFn    :422   Median :2.000  
##  missing:690   BuiltIn: 88   Mean   :1979   Unf    :605   Mean   :1.767  
##  Po     : 20   CarPort:  9   3rd Qu.:2002                 3rd Qu.:2.000  
##  TA     :313   Detchd :387   Max.   :2010                 Max.   :4.000  
##                missing: 81   NA's   :81                                  
##    GarageArea       GarageQual     GarageCond   PavedDrive
##  Min.   :   0.0   Ex     :   3   Ex     :   2   N:  90    
##  1st Qu.: 334.5   Fa     :  48   Fa     :  35   P:  30    
##  Median : 480.0   Gd     :  14   Gd     :   9   Y:1340    
##  Mean   : 473.0   missing:  81   missing:  81             
##  3rd Qu.: 576.0   Po     :   3   Po     :   7             
##  Max.   :1418.0   TA     :1311   TA     :1326             
##                                                           
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea           PoolQC         Fence     
##  Min.   :  0.00   Min.   :  0.000   Ex     :   2   GdPrv  :  59  
##  1st Qu.:  0.00   1st Qu.:  0.000   Fa     :   2   GdWo   :  54  
##  Median :  0.00   Median :  0.000   Gd     :   3   missing:1179  
##  Mean   : 15.06   Mean   :  2.759   missing:1453   MnPrv  : 157  
##  3rd Qu.:  0.00   3rd Qu.:  0.000                  MnWw   :  11  
##  Max.   :480.00   Max.   :738.000                                
##                                                                  
##   MiscFeature      MiscVal             MoSold           YrSold    
##  Gar2   :   2   Min.   :    0.00   Min.   : 1.000   Min.   :2006  
##  missing:1406   1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007  
##  Othr   :   2   Median :    0.00   Median : 6.000   Median :2008  
##  Shed   :  49   Mean   :   43.49   Mean   : 6.322   Mean   :2008  
##  TenC   :   1   3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009  
##                 Max.   :15500.00   Max.   :12.000   Max.   :2010  
##                                                                   
##     SaleType    SaleCondition    SalePrice     
##  WD     :1267   Abnorml: 101   Min.   : 34900  
##  New    : 122   AdjLand:   4   1st Qu.:129975  
##  COD    :  43   Alloca :  12   Median :163000  
##  ConLD  :   9   Family :  20   Mean   :180921  
##  ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  ConLw  :   5   Partial: 125   Max.   :755000  
##  (Other):   9

缺失值处理

## Visualise how the missing values are distributed across train.
## "STKaiti" is a CJK font family (presumably so Chinese plot labels render;
## confirm it is installed on the target machine). cex = 0.8 shrinks the text.
par(family = "STKaiti",cex = 0.8)
## NOTE(review): aggr() is not defined or attached in the visible part of this
## file; presumably VIM::aggr. Confirm the package is loaded earlier.
aggr(train)

# Fill remaining NA values with -1
## Character columns were already recoded to the "missing" factor level above,
## so the NAs left at this point are in the non-character columns. -1 is used
## as the sentinel for them.
train[is.na(train)] <- -1
test[is.na(test)] <- -1

缺失值已经处理完毕

相关性分析

分析哪些变量与目标变量的相关性较大(相关系数的绝对值 > 0.5)

# Print every numeric column whose correlation with SalePrice exceeds 0.5 in
# absolute value, followed by the coefficient itself.
# Each coefficient is computed once up front. which() drops NA coefficients
# (cor() returns NA for a zero-variance column), which would otherwise make an
# `if` condition fail with "missing value where TRUE/FALSE needed".
numeric_cols <- colnames(train)[vapply(train, is.numeric, logical(1))]
price_cor <- vapply(
  numeric_cols,
  function(col) cor(train[[col]], train$SalePrice),
  numeric(1)
)
for (col in numeric_cols[which(abs(price_cor) > 0.5)]) {
  print(col)
  print(price_cor[[col]])
}
## [1] "OverallQual"
## [1] 0.7909816
## [1] "YearBuilt"
## [1] 0.5228973
## [1] "YearRemodAdd"
## [1] 0.507101
## [1] "TotalBsmtSF"
## [1] 0.6135806
## [1] "X1stFlrSF"
## [1] 0.6058522
## [1] "GrLivArea"
## [1] 0.7086245
## [1] "FullBath"
## [1] 0.5606638
## [1] "TotRmsAbvGrd"
## [1] 0.5337232
## [1] "GarageCars"
## [1] 0.6404092
## [1] "GarageArea"
## [1] 0.6234314
## [1] "SalePrice"
## [1] 1

输出与目标变量的相关系数绝对值小于 0.1 的变量

# Print every numeric column whose correlation with SalePrice is below 0.1 in
# absolute value, followed by the coefficient itself.
# Each coefficient is computed once up front. which() drops NA coefficients
# (cor() returns NA for a zero-variance column), which would otherwise make an
# `if` condition fail with "missing value where TRUE/FALSE needed".
numeric_cols <- colnames(train)[vapply(train, is.numeric, logical(1))]
price_cor <- vapply(
  numeric_cols,
  function(col) cor(train[[col]], train$SalePrice),
  numeric(1)
)
for (col in numeric_cols[which(abs(price_cor) < 0.1)]) {
  print(col)
  print(price_cor[[col]])
}
## [1] "Id"
## [1] -0.02191672
## [1] "MSSubClass"
## [1] -0.08428414
## [1] "OverallCond"
## [1] -0.07785589
## [1] "BsmtFinSF2"
## [1] -0.01137812
## [1] "LowQualFinSF"
## [1] -0.02560613
## [1] "BsmtHalfBath"
## [1] -0.01684415
## [1] "X3SsnPorch"
## [1] 0.04458367
## [1] "PoolArea"
## [1] 0.09240355
## [1] "MiscVal"
## [1] -0.02118958
## [1] "MoSold"
## [1] 0.04643225
## [1] "YrSold"
## [1] -0.02892259

相关系数可视化

library(GGally)
# Correlation heat map of train with its first column (Id) left out; the
# title is centred above the plot.
ggcorr(train[, -1], label_size = 2) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "相关系数")

从上面的相关系数图可以看出不同变量之间相关系数的大小,从而判断任意两个变量之间相关性的强弱。

输出相关系数较大的变量

# Correlation matrix of all numeric columns.
cors <- cor(train[, vapply(train, is.numeric, logical(1))])

# Linear (column-major) positions of the strong but non-trivial correlations:
# 0.6 < |r| < 1, which leaves out the diagonal.
high_cor <- which(abs(cors) > 0.6 & (abs(cors) < 1))

# Turn the linear positions into (row, column) pairs using the matrix's real
# dimensions, so nothing depends on it having a particular number of columns
# (the index arithmetic previously hard-coded 38).
pos <- arrayInd(high_cor, dim(cors))

# `cols` is labelled from the row position and `rows` from the column
# position, matching the existing output. The matrix is symmetric, so every
# pair is listed once in each orientation anyway.
cols <- colnames(cors)[pos[, 1]]
rows <- rownames(cors)[pos[, 2]]
vals <- cors[high_cor]

cor_data <- data.frame(cols = cols, rows = rows, correlation = vals)
cor_data
##            cols         rows correlation
## 1    GarageCars  OverallQual   0.6006707
## 2     SalePrice  OverallQual   0.7909816
## 3  BsmtFullBath   BsmtFinSF1   0.6492118
## 4     X1stFlrSF  TotalBsmtSF   0.8195300
## 5     SalePrice  TotalBsmtSF   0.6135806
## 6   TotalBsmtSF    X1stFlrSF   0.8195300
## 7     SalePrice    X1stFlrSF   0.6058522
## 8     GrLivArea    X2ndFlrSF   0.6875011
## 9      HalfBath    X2ndFlrSF   0.6097073
## 10 TotRmsAbvGrd    X2ndFlrSF   0.6164226
## 11    X2ndFlrSF    GrLivArea   0.6875011
## 12     FullBath    GrLivArea   0.6300116
## 13 TotRmsAbvGrd    GrLivArea   0.8254894
## 14    SalePrice    GrLivArea   0.7086245
## 15   BsmtFinSF1 BsmtFullBath   0.6492118
## 16    GrLivArea     FullBath   0.6300116
## 17    X2ndFlrSF     HalfBath   0.6097073
## 18 TotRmsAbvGrd BedroomAbvGr   0.6766199
## 19    X2ndFlrSF TotRmsAbvGrd   0.6164226
## 20    GrLivArea TotRmsAbvGrd   0.8254894
## 21 BedroomAbvGr TotRmsAbvGrd   0.6766199
## 22  OverallQual   GarageCars   0.6006707
## 23   GarageArea   GarageCars   0.8824754
## 24    SalePrice   GarageCars   0.6404092
## 25   GarageCars   GarageArea   0.8824754
## 26    SalePrice   GarageArea   0.6234314
## 27  OverallQual    SalePrice   0.7909816
## 28  TotalBsmtSF    SalePrice   0.6135806
## 29    X1stFlrSF    SalePrice   0.6058522
## 30    GrLivArea    SalePrice   0.7086245
## 31   GarageCars    SalePrice   0.6404092
## 32   GarageArea    SalePrice   0.6234314

绘制数值变量的密度曲线

# Density curve of PoolArea drawn with ggplot2, with a centred title.
ggplot(train, aes(PoolArea)) +
  geom_density(colour = "red", size = 1.5) +
  ggtitle("PoolArea") +
  theme(plot.title = element_text(hjust = 0.5))

# Base-graphics density curve for every numeric column, one plot per column,
# titled with the column name.
for (col in names(Filter(is.numeric, train))) {
  plot(density(train[[col]]), main = col, col = "red", lwd = 2)
}

构建模型

数据准备

# Missing values were replaced by the sentinel -1 earlier in the script
# (`train[is.na(train)] <- -1`, same for test). Summing a sentinel would
# subtract from the combined totals, so it is treated as 0 here. Genuine
# areas and bath counts are never negative, so they are left untouched.
# 0L keeps integer columns integer.
zero_if_missing <- function(x) pmax(x, 0L)

# Add the two combined features to a data frame and return it:
#   total_sq_footage - above-grade living area plus basement area
#   total_baths      - full baths plus half baths (weighted 0.5), above
#                      ground and basement together
add_combined_features <- function(df) {
  df$total_sq_footage <- zero_if_missing(df$GrLivArea) +
    zero_if_missing(df$TotalBsmtSF)

  full_baths <- zero_if_missing(df$BsmtFullBath) +
    zero_if_missing(df$FullBath)
  half_baths <- zero_if_missing(df$BsmtHalfBath) +
    zero_if_missing(df$HalfBath)
  df$total_baths <- full_baths + 0.5 * half_baths

  df
}

train <- add_combined_features(train)
test <- add_combined_features(test)

# Remove Id since it should have no value in prediction.
train$Id <- NULL
test$Id <- NULL

模型准备

# Custom summary function in the format caret expects.
#
# data  - object with an "obs" and a "pred" column.
# lev   - unused; kept for the caret summary-function signature.
# model - unused; kept for the caret summary-function signature.
#
# Returns the result of rmsle(obs, pred), named "rmsle".
custom_summary <- function(data, lev = NULL, model = NULL) {
  observed <- data[, "obs"]
  predicted <- data[, "pred"]
  setNames(rmsle(observed, predicted), "rmsle")
}

# Resampling set-up for caret: 5-fold cross-validation, scored with the
# custom summary function.
control <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = custom_summary
)

# Tuning grid: every combination of 4 boosting-round counts, 2 tree depths
# and 3 learning rates (24 rows); the remaining parameters are held fixed.
grid <- expand.grid(
  nrounds = c(100, 200, 400, 800),
  max_depth = c(4, 6),
  eta = c(0.1, 0.05, 0.025),
  gamma = 0.1,
  colsample_bytree = 1,
  min_child_weight = 1
)

训练模型

# Fix the random seed before training.
set.seed(1)

# Tune an xgbTree model over `grid` using the resampling set-up in `control`.
xgb_tree_model <- train(
  SalePrice ~ .,        # predict SalePrice from all other columns
  data = train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = grid,
  metric = "rmsle",     # name produced by the custom summary function
  maximize = FALSE      # lower is better
)

# Resampled performance for every parameter combination.
xgb_tree_model$results
##      eta max_depth gamma colsample_bytree min_child_weight nrounds
## 1  0.025         4   0.1                1                1     100
## 9  0.050         4   0.1                1                1     100
## 17 0.100         4   0.1                1                1     100
## 5  0.025         6   0.1                1                1     100
## 13 0.050         6   0.1                1                1     100
## 21 0.100         6   0.1                1                1     100
## 2  0.025         4   0.1                1                1     200
## 10 0.050         4   0.1                1                1     200
## 18 0.100         4   0.1                1                1     200
## 6  0.025         6   0.1                1                1     200
## 14 0.050         6   0.1                1                1     200
## 22 0.100         6   0.1                1                1     200
## 3  0.025         4   0.1                1                1     400
## 11 0.050         4   0.1                1                1     400
## 19 0.100         4   0.1                1                1     400
## 7  0.025         6   0.1                1                1     400
## 15 0.050         6   0.1                1                1     400
## 23 0.100         6   0.1                1                1     400
## 4  0.025         4   0.1                1                1     800
## 12 0.050         4   0.1                1                1     800
## 20 0.100         4   0.1                1                1     800
## 8  0.025         6   0.1                1                1     800
## 16 0.050         6   0.1                1                1     800
## 24 0.100         6   0.1                1                1     800
##        rmsle     rmsleSD
## 1  0.1593560 0.009751864
## 9  0.1347126 0.009100853
## 17 0.1311777 0.010217330
## 5  0.1586318 0.013047040
## 13 0.1349454 0.012233283
## 21 0.1326646 0.009695248
## 2  0.1345737 0.008467956
## 10 0.1296781 0.010765366
## 18 0.1298795 0.010658022
## 6  0.1342249 0.012034810
## 14 0.1330244 0.012295185
## 22 0.1320539 0.009820517
## 3  0.1293555 0.009961432
## 11 0.1285337 0.011691981
## 19 0.1291635 0.011648479
## 7  0.1320628 0.012413344
## 15 0.1327252 0.012432386
## 23 0.1320908 0.009829632
## 4  0.1280178 0.010748809
## 12 0.1289578 0.012167940
## 20 0.1292618 0.012151351
## 8  0.1316913 0.012893355
## 16 0.1326135 0.012463334
## 24 0.1321474 0.009750108
# Parameter combination selected by the tuning run.
xgb_tree_model$bestTune
##   nrounds max_depth   eta gamma colsample_bytree min_child_weight
## 4     800         4 0.025   0.1                1                1
# Variable importance of the fitted model (the 20 most important are printed).
varImp(xgb_tree_model)
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 166)
## 
##                      Overall
## OverallQual         100.0000
## total_sq_footage     89.6357
## total_baths           6.5326
## YearBuilt             5.8619
## LotArea               3.7748
## BsmtFinSF1            3.4032
## GarageCars            3.3694
## X2ndFlrSF             3.1744
## YearRemodAdd          2.8551
## GrLivArea             2.7669
## OverallCond           1.7652
## Fireplaces            1.4815
## GarageArea            1.4109
## OpenPorchSF           1.2184
## KitchenQualTA         0.9510
## TotalBsmtSF           0.9498
## LotFrontage           0.8790
## BsmtUnfSF             0.8397
## NeighborhoodEdwards   0.7602
## KitchenAbvGr          0.6890
# Summary listing of the components of the fitted model object.
summary(xgb_tree_model)
##             Length Class              Mode       
## handle           1 xgb.Booster.handle externalptr
## raw         821408 -none-             raw        
## xNames         270 -none-             character  
## problemType      1 -none-             character  
## tuneValue        6 data.frame         list       
## obsLevels        1 -none-             logical