参考来源:http://hamelg.blogspot.jp/2016/09/kaggle-home-price-prediction-tutorial.html?view=magazine
# Load the training and test tables from CSV files in the working directory.
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Inspect the dimensions of both tables; the trailing `##` comments are the
# console output recorded when the notebook was rendered.
dim(train)## [1] 1460 81
dim(test)## [1] 1459 80
str(train)## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
unique(train$MiscFeature)## [1] NA "Shed" "Gar2" "Othr" "TenC"
unique(test$MiscFeature)## [1] NA "Gar2" "Shed" "Othr"
从上面的结果可以看出,训练集和测试集的某些特征中所含有的字符串取值并不相同。针对这一情况需要进行处理:在定义因子变量的时候,需要将训练集和测试集的相同特征同时定义为因子变量,以保证它们的水平个数相同。
# Set the target aside so that train and test have identical columns.
SalePrice <- train[["SalePrice"]]
train[["SalePrice"]] <- NULL

# Stack the training rows on top of the test rows.
full_data <- rbind(train, test)

# Convert every character column to a factor over the combined data, so the
# training and test sets end up with the same factor levels. Missing strings
# are replaced by the explicit level "missing" first.
for (name in names(full_data)) {
  values <- full_data[[name]]
  if (!is.character(values)) {
    next
  }
  values[is.na(values)] <- "missing"
  full_data[[name]] <- as.factor(values)
}
# Split the combined table back into its two parts: the first nrow(train)
# rows are the training set, everything after them is the test set.
n_train <- nrow(train)
n_total <- nrow(full_data)
train_rows <- 1:n_train
test_rows <- (n_train + 1):n_total
train <- full_data[train_rows, ]
test <- full_data[test_rows, ]
# Re-attach the target that was set aside before the two sets were combined.
train$SalePrice <- SalePrice
summary(train)## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 missing: 0 Median : 69.00
## Mean : 730.5 Mean : 56.9 RH : 16 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RL :1151 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 RM : 218 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl : 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 missing:1369 IR2: 41 HLS: 50
## Median : 9478 Pave : 41 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub :1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## missing: 0 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## NoSeWa : 1 FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000
## PosA : 1 1.5Unf : 14 Max. :10.000
## (Other): 2 (Other): 19
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 missing: 8 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 None :864 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 Stone :128 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65
## Gd:488 Gd: 146 PConc :647 Gd :618 missing: 37
## TA:906 Po: 1 Slab : 24 missing: 37 Po : 2
## TA:1282 Stone : 6 TA :649 TA :1311
## Wood : 3
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## missing: 38 GLQ :418 Median : 383.5 GLQ : 14
## Mn :114 LwQ : 74 Mean : 443.6 LwQ : 46
## No :953 missing: 37 3rd Qu.: 712.2 missing: 38
## Rec :133 Max. :5644.0 Rec : 54
## Unf :430 Unf :1256
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## Ex:741 N: 95 FuseA : 94 Min. : 334 Min. : 0
## Fa: 49 Y:1365 FuseF : 27 1st Qu.: 882 1st Qu.: 0
## Gd:241 FuseP : 3 Median :1087 Median : 0
## Po: 1 missing: 1 Mean :1163 Mean : 347
## TA:428 Mix : 1 3rd Qu.:1391 3rd Qu.: 728
## SBrkr :1334 Max. :4692 Max. :2065
##
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.000 Median :1464 Median :0.0000 Median :0.00000
## Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753
## 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000
##
## FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :0.0000 Median :3.000 Median :1.000
## Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces
## Ex :100 Min. : 2.000 Typ :1360 Min. :0.000
## Fa : 39 1st Qu.: 5.000 Min2 : 34 1st Qu.:0.000
## Gd :586 Median : 6.000 Min1 : 31 Median :1.000
## missing: 0 Mean : 6.518 Mod : 15 Mean :0.613
## TA :735 3rd Qu.: 7.000 Maj1 : 14 3rd Qu.:1.000
## Max. :14.000 Maj2 : 5 Max. :3.000
## (Other): 1
## FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## Ex : 24 2Types : 6 Min. :1900 Fin :352 Min. :0.000
## Fa : 33 Attchd :870 1st Qu.:1961 missing: 81 1st Qu.:1.000
## Gd :380 Basment: 19 Median :1980 RFn :422 Median :2.000
## missing:690 BuiltIn: 88 Mean :1979 Unf :605 Mean :1.767
## Po : 20 CarPort: 9 3rd Qu.:2002 3rd Qu.:2.000
## TA :313 Detchd :387 Max. :2010 Max. :4.000
## missing: 81 NA's :81
## GarageArea GarageQual GarageCond PavedDrive
## Min. : 0.0 Ex : 3 Ex : 2 N: 90
## 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30
## Median : 480.0 Gd : 14 Gd : 9 Y:1340
## Mean : 473.0 missing: 81 missing: 81
## 3rd Qu.: 576.0 Po : 3 Po : 7
## Max. :1418.0 TA :1311 TA :1326
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv : 59
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54
## Median : 0.00 Median : 0.000 Gd : 3 missing:1179
## Mean : 15.06 Mean : 2.759 missing:1453 MnPrv : 157
## 3rd Qu.: 0.00 3rd Qu.: 0.000 MnWw : 11
## Max. :480.00 Max. :738.000
##
## MiscFeature MiscVal MoSold YrSold
## Gar2 : 2 Min. : 0.00 Min. : 1.000 Min. :2006
## missing:1406 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007
## Othr : 2 Median : 0.00 Median : 6.000 Median :2008
## Shed : 49 Mean : 43.49 Mean : 6.322 Mean :2008
## TenC : 1 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :15500.00 Max. :12.000 Max. :2010
##
## SaleType SaleCondition SalePrice
## WD :1267 Abnorml: 101 Min. : 34900
## New : 122 AdjLand: 4 1st Qu.:129975
## COD : 43 Alloca : 12 Median :163000
## ConLD : 9 Family : 20 Mean :180921
## ConLI : 5 Normal :1198 3rd Qu.:214000
## ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Visualise how the missing values are distributed across the training set.
# NOTE(review): `aggr()` is presumably VIM::aggr -- confirm that the package
# is attached before this point; no library() call for it is visible here.
par(family = "STKaiti", cex = 0.8)
aggr(train)

# Fill the remaining NA values with -1. Character columns were already given
# an explicit "missing" level when they were converted to factors, so this
# only affects the columns that still contain NA.
train[is.na(train)] <- -1
test[is.na(test)] <- -1
# Missing values have now been handled.
分析哪些变量与目标变量的相关性较大(相关系数的绝对值 > 0.5)
# Print the name and correlation of every numeric column whose correlation
# with SalePrice exceeds 0.5 in absolute value. SalePrice itself is numeric,
# so it is reported too (with correlation 1).
for (feature in names(train)) {
  values <- train[[feature]]
  if (!is.numeric(values)) {
    next
  }
  r <- cor(values, train$SalePrice)
  if (abs(r) > 0.5) {
    print(feature)
    print(r)
  }
}
## [1] "OverallQual"
## [1] 0.7909816
## [1] "YearBuilt"
## [1] 0.5228973
## [1] "YearRemodAdd"
## [1] 0.507101
## [1] "TotalBsmtSF"
## [1] 0.6135806
## [1] "X1stFlrSF"
## [1] 0.6058522
## [1] "GrLivArea"
## [1] 0.7086245
## [1] "FullBath"
## [1] 0.5606638
## [1] "TotRmsAbvGrd"
## [1] 0.5337232
## [1] "GarageCars"
## [1] 0.6404092
## [1] "GarageArea"
## [1] 0.6234314
## [1] "SalePrice"
## [1] 1
输出与目标变量的相关系数绝对值小于 0.1 的变量
# Print the name and correlation of every numeric column whose correlation
# with SalePrice is below 0.1 in absolute value.
for (feature in names(train)) {
  values <- train[[feature]]
  if (!is.numeric(values)) {
    next
  }
  r <- cor(values, train$SalePrice)
  if (abs(r) < 0.1) {
    print(feature)
    print(r)
  }
}
## [1] "Id"
## [1] -0.02191672
## [1] "MSSubClass"
## [1] -0.08428414
## [1] "OverallCond"
## [1] -0.07785589
## [1] "BsmtFinSF2"
## [1] -0.01137812
## [1] "LowQualFinSF"
## [1] -0.02560613
## [1] "BsmtHalfBath"
## [1] -0.01684415
## [1] "X3SsnPorch"
## [1] 0.04458367
## [1] "PoolArea"
## [1] 0.09240355
## [1] "MiscVal"
## [1] -0.02118958
## [1] "MoSold"
## [1] 0.04643225
## [1] "YrSold"
## [1] -0.02892259
相关系数可视化
# Draw a correlation plot for the training data, leaving out the first
# column (Id). The plot title is centred via `hjust = 0.5`.
library(GGally)
ggcorr(train[, -1], label_size = 2) +
  ggtitle("相关系数") +
  theme(plot.title = element_text(hjust = 0.5))
# The plot above shows the size of the correlation coefficient for each pair
# of variables, which can be used to judge how strongly two variables are
# related.
# Tabulate every pair of numeric columns whose absolute correlation lies
# strictly between 0.6 and 1 (the upper bound excludes the diagonal).
#
# The matrix positions are obtained with `which(..., arr.ind = TRUE)` rather
# than by dividing linear indices by a hard-coded matrix size, so the code
# keeps working when the number of numeric columns changes.
numeric_cols <- vapply(train, is.numeric, logical(1))
cors <- cor(train[, numeric_cols])
high_cor <- which(abs(cors) > 0.6 & abs(cors) < 1, arr.ind = TRUE)
# The correlation matrix is symmetric, so each pair is listed twice (once in
# each orientation). `rows` is labelled from the matrix column index and
# `cols` from the matrix row index, which keeps the original table layout.
rows <- rownames(cors)[high_cor[, "col"]]
cols <- colnames(cors)[high_cor[, "row"]]
vals <- cors[high_cor]
cor_data <- data.frame(cols = cols, rows = rows, correlation = vals)
cor_data## cols rows correlation
## 1 GarageCars OverallQual 0.6006707
## 2 SalePrice OverallQual 0.7909816
## 3 BsmtFullBath BsmtFinSF1 0.6492118
## 4 X1stFlrSF TotalBsmtSF 0.8195300
## 5 SalePrice TotalBsmtSF 0.6135806
## 6 TotalBsmtSF X1stFlrSF 0.8195300
## 7 SalePrice X1stFlrSF 0.6058522
## 8 GrLivArea X2ndFlrSF 0.6875011
## 9 HalfBath X2ndFlrSF 0.6097073
## 10 TotRmsAbvGrd X2ndFlrSF 0.6164226
## 11 X2ndFlrSF GrLivArea 0.6875011
## 12 FullBath GrLivArea 0.6300116
## 13 TotRmsAbvGrd GrLivArea 0.8254894
## 14 SalePrice GrLivArea 0.7086245
## 15 BsmtFinSF1 BsmtFullBath 0.6492118
## 16 GrLivArea FullBath 0.6300116
## 17 X2ndFlrSF HalfBath 0.6097073
## 18 TotRmsAbvGrd BedroomAbvGr 0.6766199
## 19 X2ndFlrSF TotRmsAbvGrd 0.6164226
## 20 GrLivArea TotRmsAbvGrd 0.8254894
## 21 BedroomAbvGr TotRmsAbvGrd 0.6766199
## 22 OverallQual GarageCars 0.6006707
## 23 GarageArea GarageCars 0.8824754
## 24 SalePrice GarageCars 0.6404092
## 25 GarageCars GarageArea 0.8824754
## 26 SalePrice GarageArea 0.6234314
## 27 OverallQual SalePrice 0.7909816
## 28 TotalBsmtSF SalePrice 0.6135806
## 29 X1stFlrSF SalePrice 0.6058522
## 30 GrLivArea SalePrice 0.7086245
## 31 GarageCars SalePrice 0.6404092
## 32 GarageArea SalePrice 0.6234314
# Density of PoolArea drawn with ggplot2, with a centred title.
ggplot(train, aes(PoolArea)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_density(colour = "red", size = 1.5) +
  ggtitle("PoolArea")

# Base-graphics density plot for every numeric column, titled with the
# column name. The loop variable is called `feature` so that it cannot be
# confused with the `col` (colour) argument of plot().
for (feature in colnames(train)) {
  if (is.numeric(train[[feature]])) {
    plot(density(train[[feature]]), main = feature, col = "red", lwd = 2)
  }
}
# Add variable that combines above grade living area with basement sq footage
## Add derived variables
# Total square footage: above-grade living area plus total basement area.
train$total_sq_footage <- with(train, GrLivArea + TotalBsmtSF)
test$total_sq_footage <- with(test, GrLivArea + TotalBsmtSF)

# Total bathrooms: full baths (basement and above ground) count as 1 each,
# half baths count as 0.5 each.
train$total_baths <- with(
  train,
  BsmtFullBath + FullBath + (0.5 * (BsmtHalfBath + HalfBath))
)
test$total_baths <- with(
  test,
  BsmtFullBath + FullBath + (0.5 * (BsmtHalfBath + HalfBath))
)

# Remove Id since it should have no value in prediction.
train[["Id"]] <- NULL
test[["Id"]] <- NULL
# Create custom summary function in proper format for caret
## Custom summary function
# Scores a set of predictions with rmsle() and returns the score as a
# numeric vector named "rmsle".
#
# data:  object with an "obs" column (observed values) and a "pred" column
#        (predicted values), both extracted with `data[, name]`.
# lev:   accepted but not used by this function.
# model: accepted but not used by this function.
#
# NOTE(review): rmsle() is not defined in this file; presumably it comes from
# the Metrics package -- confirm that it is attached before training.
custom_summary <- function(data, lev = NULL, model = NULL) {
  observed <- data[, "obs"]
  predicted <- data[, "pred"]
  setNames(rmsle(observed, predicted), "rmsle")
}
# Create control object: 5-fold cross-validation scored by custom_summary.
# NOTE(review): trainControl() and train() are presumably from the caret
# package -- confirm that it is attached; no library() call is visible here.
control <- trainControl(
  method = "cv",                     # Use cross validation
  number = 5,                        # 5 folds
  summaryFunction = custom_summary
)

# Create grid of tuning parameters (4 x 2 x 3 = 24 combinations).
grid <- expand.grid(
  nrounds = c(100, 200, 400, 800),   # Test 4 values for boosting rounds
  max_depth = c(4, 6),               # Test 2 values for tree depth
  eta = c(0.1, 0.05, 0.025),         # Test 3 values for learning rate
  gamma = 0.1,
  colsample_bytree = 1,
  min_child_weight = 1
)

# Seed the random number generator so the run can be repeated.
set.seed(1)
xgb_tree_model <- train(
  SalePrice ~ .,                     # Predict SalePrice using all features
  data = train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = grid,
  metric = "rmsle",                  # Use custom performance metric
  maximize = FALSE                   # Minimize the metric
)
## Model results
xgb_tree_model$results## eta max_depth gamma colsample_bytree min_child_weight nrounds
## 1 0.025 4 0.1 1 1 100
## 9 0.050 4 0.1 1 1 100
## 17 0.100 4 0.1 1 1 100
## 5 0.025 6 0.1 1 1 100
## 13 0.050 6 0.1 1 1 100
## 21 0.100 6 0.1 1 1 100
## 2 0.025 4 0.1 1 1 200
## 10 0.050 4 0.1 1 1 200
## 18 0.100 4 0.1 1 1 200
## 6 0.025 6 0.1 1 1 200
## 14 0.050 6 0.1 1 1 200
## 22 0.100 6 0.1 1 1 200
## 3 0.025 4 0.1 1 1 400
## 11 0.050 4 0.1 1 1 400
## 19 0.100 4 0.1 1 1 400
## 7 0.025 6 0.1 1 1 400
## 15 0.050 6 0.1 1 1 400
## 23 0.100 6 0.1 1 1 400
## 4 0.025 4 0.1 1 1 800
## 12 0.050 4 0.1 1 1 800
## 20 0.100 4 0.1 1 1 800
## 8 0.025 6 0.1 1 1 800
## 16 0.050 6 0.1 1 1 800
## 24 0.100 6 0.1 1 1 800
## rmsle rmsleSD
## 1 0.1593560 0.009751864
## 9 0.1347126 0.009100853
## 17 0.1311777 0.010217330
## 5 0.1586318 0.013047040
## 13 0.1349454 0.012233283
## 21 0.1326646 0.009695248
## 2 0.1345737 0.008467956
## 10 0.1296781 0.010765366
## 18 0.1298795 0.010658022
## 6 0.1342249 0.012034810
## 14 0.1330244 0.012295185
## 22 0.1320539 0.009820517
## 3 0.1293555 0.009961432
## 11 0.1285337 0.011691981
## 19 0.1291635 0.011648479
## 7 0.1320628 0.012413344
## 15 0.1327252 0.012432386
## 23 0.1320908 0.009829632
## 4 0.1280178 0.010748809
## 12 0.1289578 0.012167940
## 20 0.1292618 0.012151351
## 8 0.1316913 0.012893355
## 16 0.1326135 0.012463334
## 24 0.1321474 0.009750108
xgb_tree_model$bestTune## nrounds max_depth eta gamma colsample_bytree min_child_weight
## 4 800 4 0.025 0.1 1 1
varImp(xgb_tree_model)## xgbTree variable importance
##
## only 20 most important variables shown (out of 166)
##
## Overall
## OverallQual 100.0000
## total_sq_footage 89.6357
## total_baths 6.5326
## YearBuilt 5.8619
## LotArea 3.7748
## BsmtFinSF1 3.4032
## GarageCars 3.3694
## X2ndFlrSF 3.1744
## YearRemodAdd 2.8551
## GrLivArea 2.7669
## OverallCond 1.7652
## Fireplaces 1.4815
## GarageArea 1.4109
## OpenPorchSF 1.2184
## KitchenQualTA 0.9510
## TotalBsmtSF 0.9498
## LotFrontage 0.8790
## BsmtUnfSF 0.8397
## NeighborhoodEdwards 0.7602
## KitchenAbvGr 0.6890
summary(xgb_tree_model)## Length Class Mode
## handle 1 xgb.Booster.handle externalptr
## raw 821408 -none- raw
## xNames 270 -none- character
## problemType 1 -none- character
## tuneValue 6 data.frame list
## obsLevels 1 -none- logical