library(readr)
library(dplyr)
library(ggplot2)
library(reshape) # Para renombrar columnas
# Training set. read_csv() returns a tibble and keeps the original column
# names, including non-syntactic ones such as `1stFlrSF`.
entrena <- read_csv(
  file = "../datos/house-prices-advanced-regression-techniques/train.csv"
)
# Validation (test) set.
# NOTE(review): this one is loaded with base read.csv(), which yields a plain
# data.frame and rewrites names such as `1stFlrSF` to `X1stFlrSF`; confirm the
# mismatch with `entrena` is intended before using both sets together.
valida <- read.csv(
  file = "../datos/house-prices-advanced-regression-techniques/test.csv"
)
# Preview the first records of the training set.
head(entrena, n = 6L)
## # A tibble: 6 x 81
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## # … with 73 more variables: LandContour <chr>, Utilities <chr>,
## # LotConfig <chr>, LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>,
## # Condition2 <chr>, BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>,
## # OverallCond <dbl>, YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>,
## # RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## # MasVnrArea <dbl>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## # BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## # BsmtFinSF1 <dbl>, BsmtFinType2 <chr>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>,
## # TotalBsmtSF <dbl>, Heating <chr>, HeatingQC <chr>, CentralAir <chr>,
## # Electrical <chr>, `1stFlrSF` <dbl>, `2ndFlrSF` <dbl>, LowQualFinSF <dbl>,
## # GrLivArea <dbl>, BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## # HalfBath <dbl>, BedroomAbvGr <dbl>, KitchenAbvGr <dbl>, KitchenQual <chr>,
## # TotRmsAbvGrd <dbl>, Functional <chr>, Fireplaces <dbl>, FireplaceQu <chr>,
## # GarageType <chr>, GarageYrBlt <dbl>, GarageFinish <chr>, GarageCars <dbl>,
## # GarageArea <dbl>, GarageQual <chr>, GarageCond <chr>, PavedDrive <chr>,
## # WoodDeckSF <dbl>, OpenPorchSF <dbl>, EnclosedPorch <dbl>,
## # `3SsnPorch` <dbl>, ScreenPorch <dbl>, PoolArea <dbl>, PoolQC <chr>,
## # Fence <chr>, MiscFeature <chr>, MiscVal <dbl>, MoSold <dbl>, YrSold <dbl>,
## # SaleType <chr>, SaleCondition <chr>, SalePrice <dbl>
# Preview the first records of the validation set.
head(valida, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1 Lvl AllPub Inside Gtl NAmes Feedr Norm
## 2 Lvl AllPub Corner Gtl NAmes Norm Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm Norm
## BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1 1Fam 1Story 5 6 1961 1961 Gable
## 2 1Fam 1Story 6 6 1958 1958 Hip
## 3 1Fam 2Story 5 5 1997 1998 Gable
## 4 1Fam 2Story 6 6 1998 1998 Gable
## 5 TwnhsE 1Story 8 5 1992 1992 Gable
## 6 1Fam 2Story 6 5 1993 1994 Gable
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## 1 CompShg VinylSd VinylSd None 0 TA TA
## 2 CompShg Wd Sdng Wd Sdng BrkFace 108 TA TA
## 3 CompShg VinylSd VinylSd None 0 TA TA
## 4 CompShg VinylSd VinylSd BrkFace 20 TA TA
## 5 CompShg HdBoard HdBoard None 0 Gd TA
## 6 CompShg HdBoard HdBoard None 0 TA TA
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1 CBlock TA TA No Rec 468
## 2 CBlock TA TA No ALQ 923
## 3 PConc Gd TA No GLQ 791
## 4 PConc TA TA No GLQ 602
## 5 PConc Gd TA No ALQ 263
## 6 PConc Gd TA No Unf 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 1 LwQ 144 270 882 GasA TA Y
## 2 Unf 0 406 1329 GasA TA Y
## 3 Unf 0 137 928 GasA Gd Y
## 4 Unf 0 324 926 GasA Ex Y
## 5 Unf 0 1017 1280 GasA Ex Y
## 6 Unf 0 763 763 GasA Gd Y
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## 1 SBrkr 896 0 0 896 0
## 2 SBrkr 1329 0 0 1329 0
## 3 SBrkr 928 701 0 1629 0
## 4 SBrkr 926 678 0 1604 0
## 5 SBrkr 1280 0 0 1280 0
## 6 SBrkr 763 892 0 1655 0
## BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## 1 0 1 0 2 1 TA
## 2 0 1 1 3 1 Gd
## 3 0 2 1 3 1 TA
## 4 0 2 1 3 1 Gd
## 5 0 2 0 2 1 Gd
## 6 0 2 1 3 1 TA
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 1 5 Typ 0 <NA> Attchd 1961
## 2 6 Typ 0 <NA> Attchd 1958
## 3 6 Typ 1 TA Attchd 1997
## 4 7 Typ 1 Gd Attchd 1998
## 5 5 Typ 0 <NA> Attchd 1992
## 6 7 Typ 1 TA Attchd 1993
## GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1 Unf 1 730 TA TA Y
## 2 Unf 1 312 TA TA Y
## 3 Fin 2 482 TA TA Y
## 4 Fin 2 470 TA TA Y
## 5 RFn 2 506 TA TA Y
## 6 Fin 2 440 TA TA Y
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 1 140 0 0 0 120 0 <NA>
## 2 393 36 0 0 0 0 <NA>
## 3 212 34 0 0 0 0 <NA>
## 4 360 36 0 0 0 0 <NA>
## 5 0 82 0 0 144 0 <NA>
## 6 157 84 0 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 1 MnPrv <NA> 0 6 2010 WD Normal
## 2 <NA> Gar2 12500 6 2010 WD Normal
## 3 MnPrv <NA> 0 3 2010 WD Normal
## 4 <NA> <NA> 0 6 2010 WD Normal
## 5 <NA> <NA> 0 1 2010 WD Normal
## 6 <NA> <NA> 0 4 2010 WD Normal
# Compact overview of the training set: column types and leading values.
str(object = entrena)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1460 obs. of 81 variables:
## $ Id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num 8450 9600 11250 9550 14260 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num 706 978 486 216 655 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num 856 1262 920 756 1145 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num 854 0 866 756 1053 ...
## $ LowQualFinSF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : num 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num 2008 2007 2008 2006 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
# Per-column descriptive statistics of the training set.
summary(object = entrena)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
dim(entrena)[1L] # Number of records (rows)
## [1] 1460
dim(entrena)[2L] # Number of columns
## [1] 81
# Histogram of the sale price.
# NOTE(review): the original note described this as a normal distribution, but
# summary(entrena) reports mean 180921 > median 163000 and a maximum of 755000,
# which points to a right-skewed shape; confirm on the plot.
ggplot(data = entrena, mapping = aes(x = SalePrice)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Flag which columns are numeric: a named logical vector with one entry per
# column. vapply() pins the result type, unlike sapply().
vapply(entrena, is.numeric, logical(1)) # Which ones are they?
## Id MSSubClass MSZoning LotFrontage LotArea
## TRUE TRUE FALSE TRUE TRUE
## Street Alley LotShape LandContour Utilities
## FALSE FALSE FALSE FALSE FALSE
## LotConfig LandSlope Neighborhood Condition1 Condition2
## FALSE FALSE FALSE FALSE FALSE
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## FALSE FALSE TRUE TRUE TRUE
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## TRUE FALSE FALSE FALSE FALSE
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## FALSE TRUE FALSE FALSE FALSE
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## FALSE FALSE FALSE FALSE TRUE
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## FALSE TRUE TRUE TRUE FALSE
## HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF
## FALSE FALSE FALSE TRUE TRUE
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## TRUE TRUE TRUE TRUE TRUE
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## TRUE TRUE TRUE FALSE TRUE
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## FALSE TRUE FALSE FALSE TRUE
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## FALSE TRUE TRUE FALSE FALSE
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## FALSE TRUE TRUE TRUE TRUE
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## TRUE TRUE FALSE FALSE FALSE
## MiscVal MoSold YrSold SaleType SaleCondition
## TRUE TRUE TRUE FALSE FALSE
## SalePrice
## TRUE
# Positions (named by column) of the numeric columns in the training set.
# vapply() guarantees a logical vector, whatever the shape of the input.
cualesNumericas <- which(vapply(entrena, is.numeric, logical(1)))
# Names of those columns, used below to subset the data.
nombresColumnas <- names(cualesNumericas)
nombresColumnas
## [1] "Id" "MSSubClass" "LotFrontage" "LotArea"
## [5] "OverallQual" "OverallCond" "YearBuilt" "YearRemodAdd"
## [9] "MasVnrArea" "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF"
## [13] "TotalBsmtSF" "1stFlrSF" "2ndFlrSF" "LowQualFinSF"
## [17] "GrLivArea" "BsmtFullBath" "BsmtHalfBath" "FullBath"
## [21] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd"
## [25] "Fireplaces" "GarageYrBlt" "GarageCars" "GarageArea"
## [29] "WoodDeckSF" "OpenPorchSF" "EnclosedPorch" "3SsnPorch"
## [33] "ScreenPorch" "PoolArea" "MiscVal" "MoSold"
## [37] "YrSold" "SalePrice"
# TODO: check the data dictionary for the meaning of each variable.
# Subset holding only the numeric columns. all_of() states explicitly that
# nombresColumnas is an external character vector of column names, so it can
# never be confused with a column that happens to be called `nombresColumnas`.
entrenaNumericas <- select(entrena, all_of(nombresColumnas))
entrenaNumericas
## # A tibble: 1,460 x 38
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 60 65 8450 7 5 2003
## 2 2 20 80 9600 6 8 1976
## 3 3 60 68 11250 7 5 2001
## 4 4 70 60 9550 7 5 1915
## 5 5 60 84 14260 8 5 2000
## 6 6 50 85 14115 5 5 1993
## 7 7 20 75 10084 8 5 2004
## 8 8 60 NA 10382 7 6 1973
## 9 9 50 51 6120 7 5 1931
## 10 10 190 50 7420 5 6 1939
## # … with 1,450 more rows, and 31 more variables: YearRemodAdd <dbl>,
## # MasVnrArea <dbl>, BsmtFinSF1 <dbl>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>,
## # TotalBsmtSF <dbl>, `1stFlrSF` <dbl>, `2ndFlrSF` <dbl>, LowQualFinSF <dbl>,
## # GrLivArea <dbl>, BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## # HalfBath <dbl>, BedroomAbvGr <dbl>, KitchenAbvGr <dbl>, TotRmsAbvGrd <dbl>,
## # Fireplaces <dbl>, GarageYrBlt <dbl>, GarageCars <dbl>, GarageArea <dbl>,
## # WoodDeckSF <dbl>, OpenPorchSF <dbl>, EnclosedPorch <dbl>,
## # `3SsnPorch` <dbl>, ScreenPorch <dbl>, PoolArea <dbl>, MiscVal <dbl>,
## # MoSold <dbl>, YrSold <dbl>, SalePrice <dbl>
# Pairwise correlations between all numeric columns. Columns that contain
# missing values yield NA because cor() is called with its default `use`.
# NOTE(review): consider use = "pairwise.complete.obs" if those NAs are
# unwanted; left unchanged here so the reported values stay the same.
correlaciones <- data.frame(cor(entrenaNumericas))
# Correlation of every numeric variable with SalePrice. The two columns are
# given to data.frame() directly rather than through cbind(): cbind() of names
# and numbers coerces everything to text, which turns any later sort on
# `correlacion` into a string sort (e.g. -0.13 listed before -0.01).
# NOTE(review): the `##` output recorded below predates this change (text
# column, string-sorted); re-run to refresh it.
entrenaCol.Correl.Price <- data.frame(
  variable = rownames(correlaciones),
  correlacion = correlaciones$SalePrice
)
entrenaCol.Correl.Price
## variable correlacion
## 1 Id -0.0219167194434311
## 2 MSSubClass -0.0842841351265952
## 3 LotFrontage <NA>
## 4 LotArea 0.263843353871406
## 5 OverallQual 0.790981600583805
## 6 OverallCond -0.077855894048678
## 7 YearBuilt 0.522897332879497
## 8 YearRemodAdd 0.507100967111386
## 9 MasVnrArea <NA>
## 10 BsmtFinSF1 0.386419806242153
## 11 BsmtFinSF2 -0.0113781214502151
## 12 BsmtUnfSF 0.214479105546969
## 13 TotalBsmtSF 0.613580551559196
## 14 1stFlrSF 0.605852184691915
## 15 2ndFlrSF 0.319333802832068
## 16 LowQualFinSF -0.0256061300006795
## 17 GrLivArea 0.708624477612652
## 18 BsmtFullBath 0.227122233131494
## 19 BsmtHalfBath -0.016844154297359
## 20 FullBath 0.560663762748446
## 21 HalfBath 0.284107675594783
## 22 BedroomAbvGr 0.16821315430074
## 23 KitchenAbvGr -0.135907370842141
## 24 TotRmsAbvGrd 0.533723155582028
## 25 Fireplaces 0.466928836751528
## 26 GarageYrBlt <NA>
## 27 GarageCars 0.640409197258352
## 28 GarageArea 0.623431438918362
## 29 WoodDeckSF 0.32441344456813
## 30 OpenPorchSF 0.315856227116055
## 31 EnclosedPorch -0.128577957925957
## 32 3SsnPorch 0.0445836653357484
## 33 ScreenPorch 0.111446571142911
## 34 PoolArea 0.0924035494918732
## 35 MiscVal -0.0211895796403033
## 36 MoSold 0.0464322452238193
## 37 YrSold -0.0289225851687303
## 38 SalePrice 1
# Reorder the table by `correlacion` in descending order, then display it.
entrenaCol.Correl.Price <- dplyr::arrange(
  entrenaCol.Correl.Price,
  dplyr::desc(correlacion)
)
entrenaCol.Correl.Price
## variable correlacion
## 1 SalePrice 1
## 2 OverallQual 0.790981600583805
## 3 GrLivArea 0.708624477612652
## 4 GarageCars 0.640409197258352
## 5 GarageArea 0.623431438918362
## 6 TotalBsmtSF 0.613580551559196
## 7 1stFlrSF 0.605852184691915
## 8 FullBath 0.560663762748446
## 9 TotRmsAbvGrd 0.533723155582028
## 10 YearBuilt 0.522897332879497
## 11 YearRemodAdd 0.507100967111386
## 12 Fireplaces 0.466928836751528
## 13 BsmtFinSF1 0.386419806242153
## 14 WoodDeckSF 0.32441344456813
## 15 2ndFlrSF 0.319333802832068
## 16 OpenPorchSF 0.315856227116055
## 17 HalfBath 0.284107675594783
## 18 LotArea 0.263843353871406
## 19 BsmtFullBath 0.227122233131494
## 20 BsmtUnfSF 0.214479105546969
## 21 BedroomAbvGr 0.16821315430074
## 22 ScreenPorch 0.111446571142911
## 23 PoolArea 0.0924035494918732
## 24 MoSold 0.0464322452238193
## 25 3SsnPorch 0.0445836653357484
## 26 KitchenAbvGr -0.135907370842141
## 27 EnclosedPorch -0.128577957925957
## 28 MSSubClass -0.0842841351265952
## 29 OverallCond -0.077855894048678
## 30 YrSold -0.0289225851687303
## 31 LowQualFinSF -0.0256061300006795
## 32 Id -0.0219167194434311
## 33 MiscVal -0.0211895796403033
## 34 BsmtHalfBath -0.016844154297359
## 35 BsmtFinSF2 -0.0113781214502151
## 36 LotFrontage <NA>
## 37 MasVnrArea <NA>
## 38 GarageYrBlt <NA>
# First rows of the ranking (SalePrice itself comes first, with correlation 1).
head(entrenaCol.Correl.Price, n = 6L)
## variable correlacion
## 1 SalePrice 1
## 2 OverallQual 0.790981600583805
## 3 GrLivArea 0.708624477612652
## 4 GarageCars 0.640409197258352
## 5 GarageArea 0.623431438918362
## 6 TotalBsmtSF 0.613580551559196
# Scatter plots of SalePrice against the predictors listed first in the
# correlation ranking, each with a fitted linear trend line.
ggplot(
  data = entrenaNumericas,
  mapping = aes(x = OverallQual, y = SalePrice)
) +
  geom_point(colour = "darkred") +
  geom_smooth(method = "lm")
ggplot(
  data = entrenaNumericas,
  mapping = aes(x = GrLivArea, y = SalePrice)
) +
  geom_point(colour = "darkgreen") +
  geom_smooth(method = "lm")
# This plot reads from the full training set rather than the numeric subset.
ggplot(
  data = entrena,
  mapping = aes(x = GarageCars, y = SalePrice)
) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
ggplot(
  data = entrenaNumericas,
  mapping = aes(x = GarageArea, y = SalePrice)
) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
ggplot(
  data = entrenaNumericas,
  mapping = aes(x = TotalBsmtSF, y = SalePrice)
) +
  geom_point(colour = "orange") +
  geom_smooth(method = "lm")
# SalePrice against the `1stFlrSF` column. The name is not syntactic, so it
# must be written in backticks; a quoted string would map x to a single
# constant label instead of the column values.
ggplot(entrenaNumericas, aes(x = `1stFlrSF`, y = SalePrice)) +
  geom_point(color = "purple") +
  geom_smooth(method = "lm")
# `1stFlrSF` is not a syntactic R name, which makes it awkward to use in a
# model formula. Option 1, copying it into a new column, would add a redundant
# variable and is not practical. Option 2, used here, renames the column in
# place to `lstFlrSF`.
# dplyr::rename() is called with its namespace because reshape is attached
# after dplyr, so a bare rename() would resolve to reshape::rename(), which
# takes c(old = "new") instead of new = old.
entrenaNumericas <- dplyr::rename(entrenaNumericas, lstFlrSF = `1stFlrSF`)
# Linear model of SalePrice on the ten numeric predictors that rank highest by
# correlation with it. The stray duplicated `+` before FullBath was removed;
# it was parsed as a unary plus and did not alter the fit.
modelo1 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + FullBath + lstFlrSF + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd,
  data = entrenaNumericas
)
# Coefficient table and fit statistics.
summary(modelo1)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + +FullBath + lstFlrSF + TotRmsAbvGrd +
## YearBuilt + YearRemodAdd, data = entrenaNumericas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489958 -19316 -1948 16020 290558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.186e+06 1.291e+05 -9.187 < 2e-16 ***
## OverallQual 1.960e+04 1.190e+03 16.472 < 2e-16 ***
## GrLivArea 5.130e+01 4.233e+00 12.119 < 2e-16 ***
## GarageCars 1.042e+04 3.044e+03 3.422 0.000639 ***
## GarageArea 1.495e+01 1.031e+01 1.450 0.147384
## TotalBsmtSF 1.986e+01 4.295e+00 4.625 4.09e-06 ***
## FullBath -6.791e+03 2.682e+03 -2.532 0.011457 *
## lstFlrSF 1.417e+01 4.930e+00 2.875 0.004097 **
## TotRmsAbvGrd 3.310e+01 1.119e+03 0.030 0.976404
## YearBuilt 2.682e+02 5.035e+01 5.328 1.15e-07 ***
## YearRemodAdd 2.965e+02 6.363e+01 4.659 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.7721
## F-statistic: 495.4 on 10 and 1449 DF, p-value: < 2.2e-16
# `2ndFlrSF` and `3SsnPorch` are not syntactic R names; rename them in place to
# `tndFlrSF` and `tSsnPorch` so they can be used directly in model formulas.
# dplyr::rename() is called with its namespace because reshape is attached
# after dplyr, so a bare rename() would resolve to reshape::rename().
entrenaNumericas <- dplyr::rename(
  entrenaNumericas,
  tndFlrSF = `2ndFlrSF`,
  tSsnPorch = `3SsnPorch`
)
# Linear model of SalePrice on every other column of entrenaNumericas.
# NOTE(review): `.` also pulls in the `Id` column as a predictor; confirm that
# is intended.
modelo2 <- lm(SalePrice ~ ., data = entrenaNumericas)
modelo2
##
## Call:
## lm(formula = SalePrice ~ ., data = entrenaNumericas)
##
## Coefficients:
## (Intercept) Id MSSubClass LotFrontage LotArea
## -3.351e+05 -1.205e+00 -2.001e+02 -1.160e+02 5.422e-01
## OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea
## 1.866e+04 5.239e+03 3.164e+02 1.194e+02 3.141e+01
## BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF lstFlrSF
## 1.736e+01 8.342e+00 5.005e+00 NA 4.597e+01
## tndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 4.663e+01 3.341e+01 NA 9.043e+03 2.465e+03
## FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## 5.433e+03 -1.098e+03 -1.022e+04 -2.202e+04 5.464e+03
## Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF
## 4.372e+03 -4.728e+01 1.685e+04 6.274e+00 2.144e+01
## OpenPorchSF EnclosedPorch tSsnPorch ScreenPorch PoolArea
## -2.252e+00 7.295e+00 3.349e+01 5.805e+01 -6.052e+01
## MiscVal MoSold YrSold
## -3.761e+00 -2.217e+02 -2.474e+02
# Coefficient table and fit statistics for modelo2.
summary(object = modelo2)
##
## Call:
## lm(formula = SalePrice ~ ., data = entrenaNumericas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442182 -16955 -2824 15125 318183
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.351e+05 1.701e+06 -0.197 0.843909
## Id -1.205e+00 2.658e+00 -0.453 0.650332
## MSSubClass -2.001e+02 3.451e+01 -5.797 8.84e-09 ***
## LotFrontage -1.160e+02 6.126e+01 -1.894 0.058503 .
## LotArea 5.422e-01 1.575e-01 3.442 0.000599 ***
## OverallQual 1.866e+04 1.482e+03 12.592 < 2e-16 ***
## OverallCond 5.239e+03 1.368e+03 3.830 0.000135 ***
## YearBuilt 3.164e+02 8.766e+01 3.610 0.000321 ***
## YearRemodAdd 1.194e+02 8.668e+01 1.378 0.168607
## MasVnrArea 3.141e+01 7.022e+00 4.473 8.54e-06 ***
## BsmtFinSF1 1.736e+01 5.838e+00 2.973 0.003014 **
## BsmtFinSF2 8.342e+00 8.766e+00 0.952 0.341532
## BsmtUnfSF 5.005e+00 5.277e+00 0.948 0.343173
## TotalBsmtSF NA NA NA NA
## lstFlrSF 4.597e+01 7.360e+00 6.246 6.02e-10 ***
## tndFlrSF 4.663e+01 6.102e+00 7.641 4.72e-14 ***
## LowQualFinSF 3.341e+01 2.794e+01 1.196 0.232009
## GrLivArea NA NA NA NA
## BsmtFullBath 9.043e+03 3.198e+03 2.828 0.004776 **
## BsmtHalfBath 2.465e+03 5.073e+03 0.486 0.627135
## FullBath 5.433e+03 3.531e+03 1.539 0.124182
## HalfBath -1.098e+03 3.321e+03 -0.331 0.740945
## BedroomAbvGr -1.022e+04 2.155e+03 -4.742 2.40e-06 ***
## KitchenAbvGr -2.202e+04 6.710e+03 -3.282 0.001063 **
## TotRmsAbvGrd 5.464e+03 1.487e+03 3.674 0.000251 ***
## Fireplaces 4.372e+03 2.189e+03 1.998 0.046020 *
## GarageYrBlt -4.728e+01 9.106e+01 -0.519 0.603742
## GarageCars 1.685e+04 3.491e+03 4.827 1.58e-06 ***
## GarageArea 6.274e+00 1.213e+01 0.517 0.605002
## WoodDeckSF 2.144e+01 1.002e+01 2.139 0.032662 *
## OpenPorchSF -2.252e+00 1.949e+01 -0.116 0.907998
## EnclosedPorch 7.295e+00 2.062e+01 0.354 0.723590
## tSsnPorch 3.349e+01 3.758e+01 0.891 0.373163
## ScreenPorch 5.805e+01 2.041e+01 2.844 0.004532 **
## PoolArea -6.052e+01 2.990e+01 -2.024 0.043204 *
## MiscVal -3.761e+00 6.960e+00 -0.540 0.589016
## MoSold -2.217e+02 4.229e+02 -0.524 0.600188
## YrSold -2.474e+02 8.458e+02 -0.293 0.769917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36800 on 1085 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8096, Adjusted R-squared: 0.8034
## F-statistic: 131.8 on 35 and 1085 DF, p-value: < 2.2e-16
# Linear model of SalePrice on eleven of the numeric predictors.
modelo3 <- lm(
  SalePrice ~ LotArea + OverallQual + OverallCond + YearBuilt + BsmtFinSF1 +
    lstFlrSF + tndFlrSF + BedroomAbvGr + TotRmsAbvGrd + GarageCars +
    ScreenPorch,
  data = entrenaNumericas
)
modelo3
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond +
## YearBuilt + BsmtFinSF1 + lstFlrSF + tndFlrSF + BedroomAbvGr +
## TotRmsAbvGrd + GarageCars + ScreenPorch, data = entrenaNumericas)
##
## Coefficients:
## (Intercept) LotArea OverallQual OverallCond YearBuilt
## -9.425e+05 5.682e-01 2.011e+04 5.951e+03 4.252e+02
## BsmtFinSF1 lstFlrSF tndFlrSF BedroomAbvGr TotRmsAbvGrd
## 1.837e+01 6.334e+01 4.595e+01 -9.360e+03 4.630e+03
## GarageCars ScreenPorch
## 1.213e+04 5.405e+01
# Coefficient table and fit statistics for modelo3.
summary(object = modelo3)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond +
## YearBuilt + BsmtFinSF1 + lstFlrSF + tndFlrSF + BedroomAbvGr +
## TotRmsAbvGrd + GarageCars + ScreenPorch, data = entrenaNumericas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -514848 -17297 -1087 14557 287694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.425e+05 8.999e+04 -10.474 < 2e-16 ***
## LotArea 5.682e-01 1.019e-01 5.577 2.92e-08 ***
## OverallQual 2.011e+04 1.123e+03 17.916 < 2e-16 ***
## OverallCond 5.951e+03 9.467e+02 6.286 4.31e-10 ***
## YearBuilt 4.252e+02 4.595e+01 9.255 < 2e-16 ***
## BsmtFinSF1 1.837e+01 2.432e+00 7.554 7.43e-14 ***
## lstFlrSF 6.334e+01 4.569e+00 13.864 < 2e-16 ***
## tndFlrSF 4.595e+01 3.977e+00 11.555 < 2e-16 ***
## BedroomAbvGr -9.360e+03 1.686e+03 -5.552 3.36e-08 ***
## TotRmsAbvGrd 4.630e+03 1.210e+03 3.826 0.000136 ***
## GarageCars 1.213e+04 1.749e+03 6.937 6.03e-12 ***
## ScreenPorch 5.405e+01 1.724e+01 3.136 0.001748 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36220 on 1448 degrees of freedom
## Multiple R-squared: 0.7937, Adjusted R-squared: 0.7921
## F-statistic: 506.4 on 11 and 1448 DF, p-value: < 2.2e-16