Teoría

Random Forest es un algoritmo de aprendizaje automático supervisado que se usa para clasificación y/o regresión: crea múltiples árboles de decisión y combina sus resultados para hacer predicciones más precisas y estables.

Importar librerías

library(randomForest)
library(caret)
library(readr)
library(lattice)

Importar base de datos

# Load the house-price data set from the working directory into a tibble.
df <- read_csv(file = "House Prices.csv")
## Rows: 2919 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): MSZoning, LotConfig, BldgType, Exterior1st
## dbl (9): Id, MSSubClass, LotArea, OverallCond, YearBuilt, YearRemodAdd, Bsmt...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(df)

Entender la base de datos

# Per-column summary: quartiles and mean for numeric columns, length/class for
# character columns, and the NA count where values are missing.
summary(df)
##        Id           MSSubClass       MSZoning            LotArea      
##  Min.   :   0.0   Min.   : 20.00   Length:2919        Min.   :  1300  
##  1st Qu.: 729.5   1st Qu.: 20.00   Class :character   1st Qu.:  7478  
##  Median :1459.0   Median : 50.00   Mode  :character   Median :  9453  
##  Mean   :1459.0   Mean   : 57.14                      Mean   : 10168  
##  3rd Qu.:2188.5   3rd Qu.: 70.00                      3rd Qu.: 11570  
##  Max.   :2918.0   Max.   :190.00                      Max.   :215245  
##                                                                       
##   LotConfig           BldgType          OverallCond      YearBuilt   
##  Length:2919        Length:2919        Min.   :1.000   Min.   :1872  
##  Class :character   Class :character   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Mode  :character   Median :5.000   Median :1973  
##                                        Mean   :5.565   Mean   :1971  
##                                        3rd Qu.:6.000   3rd Qu.:2001  
##                                        Max.   :9.000   Max.   :2010  
##                                                                      
##   YearRemodAdd  Exterior1st          BsmtFinSF2       TotalBsmtSF    
##  Min.   :1950   Length:2919        Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:1965   Class :character   1st Qu.:   0.00   1st Qu.: 793.0  
##  Median :1993   Mode  :character   Median :   0.00   Median : 989.5  
##  Mean   :1984                      Mean   :  49.58   Mean   :1051.8  
##  3rd Qu.:2004                      3rd Qu.:   0.00   3rd Qu.:1302.0  
##  Max.   :2010                      Max.   :1526.00   Max.   :6110.0  
##                                    NA's   :1         NA's   :1       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
##  NA's   :1459
# Preview the first six rows of the data.
head(df)
## # A tibble: 6 × 13
##      Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
##   <dbl>      <dbl> <chr>      <dbl> <chr>     <chr>          <dbl>     <dbl>
## 1     0         60 RL          8450 Inside    1Fam               5      2003
## 2     1         20 RL          9600 FR2       1Fam               8      1976
## 3     2         60 RL         11250 Inside    1Fam               5      2001
## 4     3         70 RL          9550 Corner    1Fam               5      1915
## 5     4         60 RL         14260 FR2       1Fam               5      2000
## 6     5         50 RL         14115 Inside    1Fam               5      1993
## # ℹ 5 more variables: YearRemodAdd <dbl>, Exterior1st <chr>, BsmtFinSF2 <dbl>,
## #   TotalBsmtSF <dbl>, SalePrice <dbl>
# Show the type of every column; the four text columns are still character
# vectors at this point.
str(df)
## spc_tbl_ [2,919 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id          : num [1:2919] 0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : chr [1:2919] "RL" "RL" "RL" "RL" ...
##  $ LotArea     : num [1:2919] 8450 9600 11250 9550 14260 ...
##  $ LotConfig   : chr [1:2919] "Inside" "FR2" "Inside" "Corner" ...
##  $ BldgType    : chr [1:2919] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ OverallCond : num [1:2919] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : num [1:2919] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd: num [1:2919] 2003 1976 2002 1970 2000 ...
##  $ Exterior1st : chr [1:2919] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ BsmtFinSF2  : num [1:2919] 0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : num [1:2919] 856 1262 920 756 1145 ...
##  $ SalePrice   : num [1:2919] 208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotArea = col_double(),
##   ..   LotConfig = col_character(),
##   ..   BldgType = col_character(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   Exterior1st = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   SalePrice = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Recode the four text columns as factors so they are treated as categorical
# variables, keep SalePrice as a double, then re-inspect the structure.
df[["MSZoning"]] <- factor(df[["MSZoning"]])
df[["LotConfig"]] <- factor(df[["LotConfig"]])
df[["BldgType"]] <- factor(df[["BldgType"]])
df[["Exterior1st"]] <- factor(df[["Exterior1st"]])
df[["SalePrice"]] <- as.double(df[["SalePrice"]])
str(df)
## spc_tbl_ [2,919 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id          : num [1:2919] 0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotArea     : num [1:2919] 8450 9600 11250 9550 14260 ...
##  $ LotConfig   : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ BldgType    : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ OverallCond : num [1:2919] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : num [1:2919] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd: num [1:2919] 2003 1976 2002 1970 2000 ...
##  $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ BsmtFinSF2  : num [1:2919] 0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : num [1:2919] 856 1262 920 756 1145 ...
##  $ SalePrice   : num [1:2919] 208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotArea = col_double(),
##   ..   LotConfig = col_character(),
##   ..   BldgType = col_character(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   Exterior1st = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   SalePrice = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Drop every row that contains at least one NA. According to the summary
# output, SalePrice is missing in 1459 rows and BsmtFinSF2 / TotalBsmtSF in one
# row each, so only rows with a known sale price and complete predictors remain.
df <- na.omit(df)

Entrenamiento del modelo

# Fix the RNG seed so the row partition and the fitted forest are reproducible.
set.seed(123)

# Sample about 70% of the rows (as row indices) for training; the remaining
# rows form the hold-out set.
renglones_entrenamiento <- createDataPartition(
  y = df[["SalePrice"]],
  p = 0.7,
  list = FALSE
)
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]

# Fit a regression forest of SalePrice on every other column using the package
# defaults (the printed summary reports 500 trees, 4 variables per split).
# NOTE(review): `.` also pulls in Id, which looks like a plain row identifier;
# consider excluding it from the predictors -- TODO confirm.
modelo <- randomForest(SalePrice ~ ., data = entrenamiento)
print(modelo)
## 
## Call:
##  randomForest(formula = SalePrice ~ ., data = entrenamiento) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 1621295030
##                     % Var explained: 74.72

Evaluar el modelo

# Predict SalePrice for the rows used to fit the forest and for the held-out
# rows.
evaluar_entrenamiento <- predict(modelo, entrenamiento)
evaluar_prueba <- predict(modelo, prueba)

# The forest is a regression model (SalePrice is numeric), so a confusion
# matrix does not apply. Score the predictions with caret::postResample(),
# which returns RMSE, R-squared and MAE for predicted vs. observed values.
# Training-set metrics are optimistic because the trees were fitted on those
# rows; the test-set metrics are the ones that estimate generalisation error.
metricas_entrenamiento <- postResample(
  pred = evaluar_entrenamiento,
  obs = entrenamiento$SalePrice
)
metricas_prueba <- postResample(
  pred = evaluar_prueba,
  obs = prueba$SalePrice
)
print(metricas_entrenamiento)
print(metricas_prueba)

Generar predicciones

# Generate SalePrice predictions for the hold-out rows with the fitted forest.
prediccion <- predict(object = modelo, newdata = prueba)
LS0tDQp0aXRsZTogIlJhbmRvbUZvcmVzdCINCmF1dGhvcjogIlNhbWFudGhhIC0gQTAxNDIyNzQ5Ig0KZGF0ZTogIjIwMjUtMDItMjQiDQpvdXRwdXQ6IA0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCiAgICB0aGVtZTogam91cm5hbA0KLS0tDQohW10oQzovVXNlcnMvYWxlamEvUGljdHVyZXMvSW1hZ2VuZXNfdHJhYmFqb3MvQ2FzYXMuZ2lmKQ0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogZ3JlZW47Ij5UZW9yw61hPC9zcGFuPg0KDQoqKlJhbmRvbSBGb3Jlc3QqKiBlcyB1biBhbGdvcsOtdG1vIGRlIGFwcmVuZGl6YWplIGF1dG9tw6F0aWNvIHN1cGVydmlzYWRvIHF1ZSBzZSB1c2EgcGFyYSBjbGFzaWZpY2FyIHkvbyBoYWNlciByZWdyZXNpb25lcyBlbiBsYSBjcmVhY2nDs24gZGUgbcO6bHRpcGxlcyDDoXJib2xlcyBkZSBkZWNpc2nDs24geSBjb21iaW5hIHN1cyByZXN1bHRhZG9zIHBhcmEgaGFjZXIgcHJlZGljY2lvbmVzIG3DoXMgcHJlY2lzYXMgeSBlc3RhYmxlcy4NCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+SW1wb3J0YXIgbGlicmVyw61hczwvc3Bhbj4NCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KHJlYWRyKQ0KbGlicmFyeShsYXR0aWNlKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmVlbjsiPkltcG9ydGFyIGJhc2UgZGUgZGF0b3M8L3NwYW4+DQpgYGB7cn0NCmRmPC0gcmVhZF9jc3YoIkhvdXNlIFByaWNlcy5jc3YiKQ0KI1ZpZXcoSG91c2VfUHJpY2VzKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmVlbjsiPkVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+DQpgYGB7cn0NCnN1bW1hcnkoZGYpDQpoZWFkKGRmKQ0Kc3RyKGRmKQ0KZGYkTVNab25pbmcgPC0gYXMuZmFjdG9yKGRmJE1TWm9uaW5nKSANCmRmJExvdENvbmZpZyA8LSBhcy5mYWN0b3IoZGYkTG90Q29uZmlnKQ0KZGYkQmxkZ1R5cGUgPC0gYXMuZmFjdG9yKGRmJEJsZGdUeXBlKQ0KZGYkRXh0ZXJpb3Ixc3QgPC0gYXMuZmFjdG9yKGRmJEV4dGVyaW9yMXN0KQ0KZGYkU2FsZVByaWNlIDwtIGFzLm51bWVyaWMoZGYkU2FsZVByaWNlKQ0Kc3RyKGRmKQ0KZGYgPC0gbmEub21pdChkZikNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogZ3JlZW47Ij5FbnRyZW5hbWllbnRvIGRlbCBtb2RlbG88L3NwYW4+DQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCnJlbmdsb25lc19lbnRyZW5hbWllbnRvIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oZGYkU2FsZVByaWNlLCBwID0gMC43LCBsaXN0ID0gRkFMU0UpDQplbnRyZW5hbWllbnRvIDwtIGRmW3Jlbmdsb25lc19lbnRyZW5hbWllbnRvLCBdDQpwcnVlYmEgPC0gZGZbLXJlbmdsb25lc19lbnRyZW5hbWllbnRvLCBdDQptb2RlbG8gPC0gcmFuZG9tRm9yZXN0KFNhbGVQcmlj
ZX4uLCBkYXRhPSBlbnRyZW5hbWllbnRvKQ0KcHJpbnQobW9kZWxvKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmVlbjsiPkV2YWx1YXIgZWwgbW9kZWxvPC9zcGFuPg0KYGBge3J9DQpldmFsdWFyX2VudHJlbmFtaWVudG8gPC0gcHJlZGljdChtb2RlbG8sIGVudHJlbmFtaWVudG8pDQpldmFsdWFyX3BydWViYSA8LSBwcmVkaWN0KG1vZGVsbywgcHJ1ZWJhKQ0KI21hdHJpel9jb25mdXNpb25fZW50cmVuYW1pZW50byA8LSBjb25mdXNpb25NYXRyaXgoZXZhbHVhcl9lbnRyZW5hbWllbnRvLGVudHJlbmFtaWVudG8pDQpgYGANCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+R2VuZXJhciBwcmVkaWNjaW9uZXM8L3NwYW4+DQpgYGB7cn0NCnByZWRpY2Npb24gPC0gcHJlZGljdChtb2RlbG8sIHBydWViYSkNCmBgYA==