Teoría

Random Forest es un algoritmo de aprendizaje a.utmático supervisado que se sa para clasificar y/o hacer regresiones. Se basas en la creación de multiples arboles de decison y combina sus resultados para hacer predicciones más precisas y estables.

Paquetes y librerias

library("dplyr") 
library("tidyverse")
library("randomForest")#Bosques Aleatorios
library("caret") #Entrenamiento de ML

Importar base de datos

df<-read.csv("houseprices.csv")

Entender la base de datos

summary(df)
##        Id           MSSubClass       MSZoning            LotArea      
##  Min.   :   0.0   Min.   : 20.00   Length:2919        Min.   :  1300  
##  1st Qu.: 729.5   1st Qu.: 20.00   Class :character   1st Qu.:  7478  
##  Median :1459.0   Median : 50.00   Mode  :character   Median :  9453  
##  Mean   :1459.0   Mean   : 57.14                      Mean   : 10168  
##  3rd Qu.:2188.5   3rd Qu.: 70.00                      3rd Qu.: 11570  
##  Max.   :2918.0   Max.   :190.00                      Max.   :215245  
##                                                                       
##   LotConfig           BldgType          OverallCond      YearBuilt   
##  Length:2919        Length:2919        Min.   :1.000   Min.   :1872  
##  Class :character   Class :character   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Mode  :character   Median :5.000   Median :1973  
##                                        Mean   :5.565   Mean   :1971  
##                                        3rd Qu.:6.000   3rd Qu.:2001  
##                                        Max.   :9.000   Max.   :2010  
##                                                                      
##   YearRemodAdd  Exterior1st          BsmtFinSF2       TotalBsmtSF    
##  Min.   :1950   Length:2919        Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:1965   Class :character   1st Qu.:   0.00   1st Qu.: 793.0  
##  Median :1993   Mode  :character   Median :   0.00   Median : 989.5  
##  Mean   :1984                      Mean   :  49.58   Mean   :1051.8  
##  3rd Qu.:2004                      3rd Qu.:   0.00   3rd Qu.:1302.0  
##  Max.   :2010                      Max.   :1526.00   Max.   :6110.0  
##                                    NA's   :1         NA's   :1       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
##  NA's   :1459
head(df)
##   Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1  0         60       RL    8450    Inside     1Fam           5      2003
## 2  1         20       RL    9600       FR2     1Fam           8      1976
## 3  2         60       RL   11250    Inside     1Fam           5      2001
## 4  3         70       RL    9550    Corner     1Fam           5      1915
## 5  4         60       RL   14260       FR2     1Fam           5      2000
## 6  5         50       RL   14115    Inside     1Fam           5      1993
##   YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1         2003     VinylSd          0         856    208500
## 2         1976     MetalSd          0        1262    181500
## 3         2002     VinylSd          0         920    223500
## 4         1970     Wd Sdng          0         756    140000
## 5         2000     VinylSd          0        1145    250000
## 6         1995     VinylSd          0         796    143000
# Convertir múltiples columnas a factor
df <- df %>%
  mutate(
    MSZoning = as.factor(MSZoning),
    LotConfig = as.factor(LotConfig),
    BldgType = as.factor(BldgType),
    Exterior1st = as.factor(Exterior1st),
    SalesPrice = as.numeric(SalePrice)
  )

str(df)
## 'data.frame':    2919 obs. of  14 variables:
##  $ Id          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : Factor w/ 6 levels "","C (all)","FV",..: 5 5 5 5 5 5 5 5 6 5 ...
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ LotConfig   : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ BldgType    : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ OverallCond : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd: int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ Exterior1st : Factor w/ 16 levels "","AsbShng","AsphShn",..: 14 10 14 15 14 14 14 8 5 10 ...
##  $ BsmtFinSF2  : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ SalePrice   : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  $ SalesPrice  : num  208500 181500 223500 140000 250000 ...
df<-na.omit(df)

Entrenar el modelo

#Semilla para reproducibilidad
set.seed(123)

#Dividir los datos en entrenamiento y prueba
r_train <- createDataPartition(df$SalePrice, p=0.7, list=FALSE)
train <- df[r_train, ]
test <- df[-r_train, ]
modelo <- randomForest(SalesPrice ~., data=train)
# Evaluar un modelo de regresión
library(Metrics)
## 
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# Predicciones (numéricas)
pred_train <- predict(modelo, train)
real_train <- train$SalePrice

# Calcular RMSE en train
rmse_train <- rmse(real_train, pred_train)
cat("RMSE en train:", rmse_train, "\n")
## RMSE en train: 7650.706
# Calcular R^2 en train
r2_train <- cor(real_train, pred_train)^2
cat("R^2 en train:", r2_train, "\n")
## R^2 en train: 0.992778
# Realizar predicciones en el conjunto de test
pred_test <- predict(modelo, test)

# Evaluar el desempeño en test usando postResample (RMSE, R², MAE)
metrics_test <- postResample(pred = pred_test, obs = test$SalePrice)
print(metrics_test)
##         RMSE     Rsquared          MAE 
## 1.319967e+04 9.769294e-01 5.807615e+03
# Alternativamente, usando el paquete Metrics:
rmse_test <- rmse(test$SalePrice, pred_test)
mae_test  <- mae(test$SalePrice, pred_test)
r2_test   <- cor(test$SalePrice, pred_test)^2

cat("Test RMSE:", rmse_test, "\n")
## Test RMSE: 13199.67
cat("Test MAE:", mae_test, "\n")
## Test MAE: 5807.615
cat("Test R²:", r2_test, "\n")
## Test R²: 0.9769294
LS0tDQp0aXRsZTogIlJhbmRvbSBGb3Jlc3QiDQphdXRob3I6ICJSYXVsIENhbnR1LSBBMDEyODU1MzciDQpkYXRlOiAiMjAyNS0wMi0yMSINCm91dHB1dDogDQogaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IFRSVUUNCiAgICB0b2NfZmxvYXQ6IFRSVUUNCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFDQogICAgdGhlbWU6ICJqb3VybmFsIg0KICAgIGhpZ2hsaWdodDogImthdGUiDQotLS0NCg0KDQohW10oRDpcVGVjXFNleHRvIFNlbWVzdHJlXElBIGNvbmNlbnRyYWNpb25cUnN0dWRpb1xNb2R1bG8gMlxIb3VzZS5wbmcpDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmxpZ2h0Z3JlZW47Ij4qKlRlb3LDrWEqKjwvc3Bhbj4NCioqUmFuZG9tIEZvcmVzdCoqIGVzIHVuIGFsZ29yaXRtbyBkZSBhcHJlbmRpemFqZSBhLnV0bcOhdGljbyBzdXBlcnZpc2FkbyANCnF1ZSBzZSBzYSBwYXJhIGNsYXNpZmljYXIgeS9vIGhhY2VyIHJlZ3Jlc2lvbmVzLiBTZSBiYXNhcyBlbiBsYSBjcmVhY2nDs24gZGUgDQptdWx0aXBsZXMgYXJib2xlcyBkZSBkZWNpc29uIHkgY29tYmluYSBzdXMgcmVzdWx0YWRvcyBwYXJhDQpoYWNlciBwcmVkaWNjaW9uZXMgbcOhcyBwcmVjaXNhcyB5IGVzdGFibGVzLg0KDQoNCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmxpZ2h0Z3JlZW47Ij4qKlBhcXVldGVzIHkgbGlicmVyaWFzKio8L3NwYW4+DQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KDQpsaWJyYXJ5KCJkcGx5ciIpIA0KbGlicmFyeSgidGlkeXZlcnNlIikNCmxpYnJhcnkoInJhbmRvbUZvcmVzdCIpI0Jvc3F1ZXMgQWxlYXRvcmlvcw0KbGlicmFyeSgiY2FyZXQiKSAjRW50cmVuYW1pZW50byBkZSBNTA0KYGBgDQoNCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmxpZ2h0Z3JlZW47Ij4qKkltcG9ydGFyIGJhc2UgZGUgZGF0b3MqKjwvc3Bhbj4NCmBgYHtyfQ0KZGY8LXJlYWQuY3N2KCJob3VzZXByaWNlcy5jc3YiKQ0KDQpgYGANCg0KDQojIyA8c3BhbiBzdHlsZT0iY29sb3I6bGlnaHRncmVlbjsiPioqRW50ZW5kZXIgbGEgYmFzZSBkZSBkYXRvcyoqPC9zcGFuPg0KYGBge3J9DQpzdW1tYXJ5KGRmKQ0KaGVhZChkZikNCg0KDQojIENvbnZlcnRpciBtw7psdGlwbGVzIGNvbHVtbmFzIGEgZmFjdG9yDQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKA0KICAgIE1TWm9uaW5nID0gYXMuZmFjdG9yKE1TWm9uaW5nKSwNCiAgICBMb3RDb25maWcgPSBhcy5mYWN0b3IoTG90Q29uZmlnKSwNCiAgICBCbGRnVHlwZSA9IGFzLmZhY3RvcihCbGRnVHlwZSksDQogICAgRXh0ZXJpb3Ixc3QgPSBhcy5mYWN0b3IoRXh0ZXJpb3Ixc3QpLA0KICAgIFNhbGVzUHJpY2UgPSBhcy5udW1lcmljKFNhbGVQcmljZSkNCiAgKQ0KDQpzdHIoZGYpDQoNCmRmPC1uYS5vbWl0KGRmKQ0KDQpgYGANCg0KDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjpsaWdodGdyZWVuOyI+KipFbnRyZW5hciBlbCBtb2RlbG8qKjwvc3Bhbj4NCmBgYHtyfQ0KI1NlbWlsbGEgcGFyYSByZXByb2R1Y2liaWxpZGFkDQpzZXQuc2VlZCgxMjMpDQoNCiNEaXZpZGlyIGxvcyBkYXRvcyBlbiBlbnRyZW5hbWllbnRvIHkgcHJ1ZWJhDQpyX3RyYWluIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oZGYkU2FsZVByaWNlLCBwPTAuNywgbGlzdD1GQUxTRSkNCnRyYWluIDwtIGRmW3JfdHJhaW4sIF0NCnRlc3QgPC0gZGZbLXJfdHJhaW4sIF0NCm1vZGVsbyA8LSByYW5kb21Gb3Jlc3QoU2FsZXNQcmljZSB+LiwgZGF0YT10cmFpbikNCg0KYGBgDQoNCmBgYHtyfQ0KIyBFdmFsdWFyIHVuIG1vZGVsbyBkZSByZWdyZXNpw7NuDQpsaWJyYXJ5KE1ldHJpY3MpDQoNCiMgUHJlZGljY2lvbmVzIChudW3DqXJpY2FzKQ0KcHJlZF90cmFpbiA8LSBwcmVkaWN0KG1vZGVsbywgdHJhaW4pDQpyZWFsX3RyYWluIDwtIHRyYWluJFNhbGVQcmljZQ0KDQojIENhbGN1bGFyIFJNU0UgZW4gdHJhaW4NCnJtc2VfdHJhaW4gPC0gcm1zZShyZWFsX3RyYWluLCBwcmVkX3RyYWluKQ0KY2F0KCJSTVNFIGVuIHRyYWluOiIsIHJtc2VfdHJhaW4sICJcbiIpDQoNCiMgQ2FsY3VsYXIgUl4yIGVuIHRyYWluDQpyMl90cmFpbiA8LSBjb3IocmVhbF90cmFpbiwgcHJlZF90cmFpbileMg0KY2F0KCJSXjIgZW4gdHJhaW46IiwgcjJfdHJhaW4sICJcbiIpDQoNCg0KYGBgDQoNCmBgYHtyfQ0KIyBSZWFsaXphciBwcmVkaWNjaW9uZXMgZW4gZWwgY29uanVudG8gZGUgdGVzdA0KcHJlZF90ZXN0IDwtIHByZWRpY3QobW9kZWxvLCB0ZXN0KQ0KDQojIEV2YWx1YXIgZWwgZGVzZW1wZcOxbyBlbiB0ZXN0IHVzYW5kbyBwb3N0UmVzYW1wbGUgKFJNU0UsIFLCsiwgTUFFKQ0KbWV0cmljc190ZXN0IDwtIHBvc3RSZXNhbXBsZShwcmVkID0gcHJlZF90ZXN0LCBvYnMgPSB0ZXN0JFNhbGVQcmljZSkNCnByaW50KG1ldHJpY3NfdGVzdCkNCg0KIyBBbHRlcm5hdGl2YW1lbnRlLCB1c2FuZG8gZWwgcGFxdWV0ZSBNZXRyaWNzOg0Kcm1zZV90ZXN0IDwtIHJtc2UodGVzdCRTYWxlUHJpY2UsIHByZWRfdGVzdCkNCm1hZV90ZXN0ICA8LSBtYWUodGVzdCRTYWxlUHJpY2UsIHByZWRfdGVzdCkNCnIyX3Rlc3QgICA8LSBjb3IodGVzdCRTYWxlUHJpY2UsIHByZWRfdGVzdCleMg0KDQpjYXQoIlRlc3QgUk1TRToiLCBybXNlX3Rlc3QsICJcbiIpDQpjYXQoIlRlc3QgTUFFOiIsIG1hZV90ZXN0LCAiXG4iKQ0KY2F0KCJUZXN0IFLCsjoiLCByMl90ZXN0LCAiXG4iKQ0KYGBgDQoNCg0K