Teoría

Random Forest es un algoritmo de aprendizaje automático supervisado que se usa para clasificar y/o hacer regresiones. Se basa en la creación de múltiples árboles de decisión y combina sus resultados para hacer predicciones más precisas y estables.

Instalar Paquetes y Llamar Librerías

library(randomForest)
library(caret)

Importar Base de Datos

# Load the house-price CSV and keep only the rows without missing values.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# on the author's machine; consider a project-relative path.
df <- na.omit(
  read.csv("C:\\Users\\USUARIO\\Desktop\\Bloque - Modulo 2\\House Prices.csv")
)

Entender la base de datos

# Per-column summary statistics of the loaded data.
summary(df)
##        Id           MSSubClass      MSZoning            LotArea      
##  Min.   :   0.0   Min.   : 20.0   Length:1460        Min.   :  1300  
##  1st Qu.: 364.8   1st Qu.: 20.0   Class :character   1st Qu.:  7554  
##  Median : 729.5   Median : 50.0   Mode  :character   Median :  9478  
##  Mean   : 729.5   Mean   : 56.9                      Mean   : 10517  
##  3rd Qu.:1094.2   3rd Qu.: 70.0                      3rd Qu.: 11602  
##  Max.   :1459.0   Max.   :190.0                      Max.   :215245  
##   LotConfig           BldgType          OverallCond      YearBuilt   
##  Length:1460        Length:1460        Min.   :1.000   Min.   :1872  
##  Class :character   Class :character   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Mode  :character   Median :5.000   Median :1973  
##                                        Mean   :5.575   Mean   :1971  
##                                        3rd Qu.:6.000   3rd Qu.:2000  
##                                        Max.   :9.000   Max.   :2010  
##   YearRemodAdd  Exterior1st          BsmtFinSF2       TotalBsmtSF    
##  Min.   :1950   Length:1460        Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:1967   Class :character   1st Qu.:   0.00   1st Qu.: 795.8  
##  Median :1994   Mode  :character   Median :   0.00   Median : 991.5  
##  Mean   :1985                      Mean   :  46.55   Mean   :1057.4  
##  3rd Qu.:2004                      3rd Qu.:   0.00   3rd Qu.:1298.2  
##  Max.   :2010                      Max.   :1474.00   Max.   :6110.0  
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
# Preview the first rows of the data.
head(df)
##   Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1  0         60       RL    8450    Inside     1Fam           5      2003
## 2  1         20       RL    9600       FR2     1Fam           8      1976
## 3  2         60       RL   11250    Inside     1Fam           5      2001
## 4  3         70       RL    9550    Corner     1Fam           5      1915
## 5  4         60       RL   14260       FR2     1Fam           5      2000
## 6  5         50       RL   14115    Inside     1Fam           5      1993
##   YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1         2003     VinylSd          0         856    208500
## 2         1976     MetalSd          0        1262    181500
## 3         2002     VinylSd          0         920    223500
## 4         1970     Wd Sdng          0         756    140000
## 5         2000     VinylSd          0        1145    250000
## 6         1995     VinylSd          0         796    143000
# Show each column's type; the categorical columns are still plain character
# vectors at this point.
str(df)
## 'data.frame':    1460 obs. of  13 variables:
##  $ Id          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : chr  "RL" "RL" "RL" "RL" ...
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ LotConfig   : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ BldgType    : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ OverallCond : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd: int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ Exterior1st : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ BsmtFinSF2  : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ SalePrice   : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, "na.action")= 'omit' Named int [1:1459] 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
##   ..- attr(*, "names")= chr [1:1459] "1461" "1462" "1463" "1464" ...
# Encode the categorical predictors as factors so randomForest treats them as
# categories instead of free text.
columnas_categoricas <- c("MSZoning", "LotConfig", "BldgType", "Exterior1st")
df[columnas_categoricas] <- lapply(df[columnas_categoricas], as.factor)
# SalePrice is intentionally left numeric: converting it to a factor would turn
# every distinct price into its own class (663 classes with one or two rows
# each) and make the forest a classifier instead of a price regression.
str(df)
## 'data.frame':    1460 obs. of  13 variables:
##  $ Id          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ LotConfig   : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ BldgType    : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ OverallCond : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd: int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ BsmtFinSF2  : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ SalePrice   : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, "na.action")= 'omit' Named int [1:1459] 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
##   ..- attr(*, "names")= chr [1:1459] "1461" "1462" "1463" "1464" ...

Entrenar el Modelo

# Fix the RNG seed so the train/test split and the forest are reproducible.
set.seed(123)
# Keep about 70% of the rows for training, sampled in a way that is stratified
# on SalePrice; the remaining rows form the test set.
renglones_entrenamiento <- createDataPartition(df$SalePrice, p = 0.7, list = FALSE)
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]
# Id is only a row counter (0, 1, 2, ...), so it is excluded from the
# predictors; every other column is used to predict SalePrice.
modelo <- randomForest(SalePrice ~ . - Id, data = entrenamiento, ntree = 100)
# print(modelo)

Evaluar el Modelo

# Predict on the training rows and on the held-out rows so the two fits can be
# compared.
evaluacion_entrenamiento <- predict(modelo, entrenamiento)
evaluacion_prueba <- predict(modelo, prueba)
# matriz_confusion_entrenamiento (training confusion matrix): not implemented yet

Generar Predicciones

# Predictions for the held-out test rows (the same call that produces
# evaluacion_prueba).
prediccion <- predict(modelo, prueba)
LS0tDQp0aXRsZTogIlJGIg0KYXV0aG9yOiAiRmFiaWFuYSBNZWRpbmFjZWxsaSAtIEEwMDgzNTg5NiINCmRhdGU6ICIyMDI1LTAyLTI0Ig0Kb3V0cHV0OiANCiAgaHRtbF9kb2N1bWVudDoNCiAgICAgIHRvYzogVFJVRQ0KICAgICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgICBjb2RlX2Rvd25sb2FkOiBUUlVFDQogICAgICB0aGVtZTogam91cm5hbA0KLS0tDQoNCiFbXShDOlxcVXNlcnNcXFVTVUFSSU9cXERlc2t0b3BcXEJsb3F1ZSAtIE1vZHVsbyAyXFx0dW1ibHJfZWQxN2FhNjdiNzQyOGM3NjNmNGQ4NDYwMDNmNjI2YzNfMGU0YWI0YjJfNjQwLmdpZikNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG9yYW5nZTsiPlRlb3LDrWE8L3NwYW4+ICANCg0KKipSYW5kb20gRm9yZXN0KiogRXMgdW4gYWxnb3JpdG1vIGRlIGFwcmVuZGl6YWplIGF1dG9tw6F0aWNvIHN1cGVydmlzYWRvIHF1ZSBzZSB1c2EgcGFyYSBjbGFzaWZpY2FyIHkvbyBoYWNlciByZWdyZXNpb25lcy4gU2UgYmFzYSBlbiBsYSBjcmVhY2lvbiBkZSBtdWx0aXBsZXMgYXJib2xlcyBkZSBkZWNpc2nDs24geSBjb21iaW5hIHN1cyByZXN1bHRhZG9zIHBhcmEgaGFjZXIgcHJlZGljY2lvbmVzIG3DoXMgcHJlY2lzYXMgeSBlc3RhYmxlcy4NCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG9yYW5nZTsiPkluc3RhbGFyIFBhcXVldGVzIHkgTGxhbWFyIExpYnJlcmlhczwvc3Bhbj4gIA0KYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCmxpYnJhcnkocmFuZG9tRm9yZXN0KQ0KbGlicmFyeShjYXJldCkNCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG9yYW5nZTsiPkltcG9ydGFyIEJhc2UgZGUgRGF0b3M8L3NwYW4+ICANCmBgYHtyfQ0KZGYgPC0gcmVhZC5jc3YoIkM6XFxVc2Vyc1xcVVNVQVJJT1xcRGVza3RvcFxcQmxvcXVlIC0gTW9kdWxvIDJcXEhvdXNlIFByaWNlcy5jc3YiKQ0KZGYgPC0gbmEub21pdChkZikNCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG9yYW5nZTsiPkVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+ICANCmBgYHtyfQ0Kc3VtbWFyeShkZikNCmhlYWQoZGYpDQpzdHIoZGYpDQpkZiRNU1pvbmluZyA8LSBhcy5mYWN0b3IoZGYkTVNab25pbmcpDQpkZiRMb3RDb25maWcgPC0gYXMuZmFjdG9yKGRmJExvdENvbmZpZykNCmRmJEJsZGdUeXBlIDwtIGFzLmZhY3RvcihkZiRCbGRnVHlwZSkNCmRmJEV4dGVyaW9yMXN0IDwtIGFzLmZhY3RvcihkZiRFeHRlcmlvcjFzdCkNCmRmJFNhbGVQcmljZSA8LSBhcy5mYWN0b3IoZGYkU2FsZVByaWNlKQ0Kc3RyKGRmKQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogb3JhbmdlOyI+RW50cmVuYXIgZWwgTW9kZWxvPC9zcGFuPiANCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpzZXQuc2VlZCgxMjMpDQpyZW5nbG9uZXNfZW50cmVuYW1pZW50byA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKGRmJFNhbGVQcmljZSwgcCA9IDAuNywgbGlzdCA9IEZBTFNFKQ0KZW50cmVuYW1pZW50
byA8LSBkZltyZW5nbG9uZXNfZW50cmVuYW1pZW50bywgXQ0KcHJ1ZWJhIDwtIGRmWy1yZW5nbG9uZXNfZW50cmVuYW1pZW50bywgXQ0KbW9kZWxvIDwtIHJhbmRvbUZvcmVzdChTYWxlUHJpY2UgfiAuLCBkYXRhPWVudHJlbmFtaWVudG8sIG50cmVlPTEwMCkNCiMgcHJpbnQobW9kZWxvKQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogb3JhbmdlOyI+RXZhbHVhciBFbCBNb2RlbG88L3NwYW4+IA0KYGBge3J9DQpldmFsdWFjaWlvbl9lbnRyZW5hbWllbnRvIDwtIHByZWRpY3QobW9kZWxvLCBlbnRyZW5hbWllbnRvKQ0KZXZhbHVhY2lvbl9wcnVlYmEgPC0gcHJlZGljdChtb2RlbG8sIHBydWViYSkNCiNtYXRyaXpfY29uZnVzaW9uX2VudHJlbmFtaWVudG8NCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG9yYW5nZTsiPkdlbmVyYXIgUHJlZGljY2lvbmVzPC9zcGFuPiANCmBgYHtyfQ0KcHJlZGljY2lvbiA8LSBwcmVkaWN0KG1vZGVsbywgcHJ1ZWJhKQ0KYGBgDQoNCg==