Random Forest es un algoritmo de aprendizaje a.utmático supervisado que se sa para clasificar y/o hacer regresiones. Se basas en la creación de multiples arboles de decison y combina sus resultados para hacer predicciones más precisas y estables.
## Id MSSubClass MSZoning LotArea
## Min. : 0.0 Min. : 20.00 Length:2919 Min. : 1300
## 1st Qu.: 729.5 1st Qu.: 20.00 Class :character 1st Qu.: 7478
## Median :1459.0 Median : 50.00 Mode :character Median : 9453
## Mean :1459.0 Mean : 57.14 Mean : 10168
## 3rd Qu.:2188.5 3rd Qu.: 70.00 3rd Qu.: 11570
## Max. :2918.0 Max. :190.00 Max. :215245
##
## LotConfig BldgType OverallCond YearBuilt
## Length:2919 Length:2919 Min. :1.000 Min. :1872
## Class :character Class :character 1st Qu.:5.000 1st Qu.:1954
## Mode :character Mode :character Median :5.000 Median :1973
## Mean :5.565 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2001
## Max. :9.000 Max. :2010
##
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF
## Min. :1950 Length:2919 Min. : 0.00 Min. : 0.0
## 1st Qu.:1965 Class :character 1st Qu.: 0.00 1st Qu.: 793.0
## Median :1993 Mode :character Median : 0.00 Median : 989.5
## Mean :1984 Mean : 49.58 Mean :1051.8
## 3rd Qu.:2004 3rd Qu.: 0.00 3rd Qu.:1302.0
## Max. :2010 Max. :1526.00 Max. :6110.0
## NA's :1 NA's :1
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
## NA's :1459
## Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1 0 60 RL 8450 Inside 1Fam 5 2003
## 2 1 20 RL 9600 FR2 1Fam 8 1976
## 3 2 60 RL 11250 Inside 1Fam 5 2001
## 4 3 70 RL 9550 Corner 1Fam 5 1915
## 5 4 60 RL 14260 FR2 1Fam 5 2000
## 6 5 50 RL 14115 Inside 1Fam 5 1993
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1 2003 VinylSd 0 856 208500
## 2 1976 MetalSd 0 1262 181500
## 3 2002 VinylSd 0 920 223500
## 4 1970 Wd Sdng 0 756 140000
## 5 2000 VinylSd 0 1145 250000
## 6 1995 VinylSd 0 796 143000
# Convertir múltiples columnas a factor
df <- df %>%
mutate(
MSZoning = as.factor(MSZoning),
LotConfig = as.factor(LotConfig),
BldgType = as.factor(BldgType),
Exterior1st = as.factor(Exterior1st),
SalesPrice = as.numeric(SalePrice)
)
str(df)## 'data.frame': 2919 obs. of 14 variables:
## $ Id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 6 levels "","C (all)","FV",..: 5 5 5 5 5 5 5 5 6 5 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ Exterior1st : Factor w/ 16 levels "","AsbShng","AsphShn",..: 14 10 14 15 14 14 14 8 5 10 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## $ SalesPrice : num 208500 181500 223500 140000 250000 ...
#Semilla para reproducibilidad
set.seed(123)
#Dividir los datos en entrenamiento y prueba
r_train <- createDataPartition(df$SalePrice, p=0.7, list=FALSE)
train <- df[r_train, ]
test <- df[-r_train, ]
modelo <- randomForest(SalesPrice ~., data=train)##
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
# Predicciones (numéricas)
pred_train <- predict(modelo, train)
real_train <- train$SalePrice
# Calcular RMSE en train
rmse_train <- rmse(real_train, pred_train)
cat("RMSE en train:", rmse_train, "\n")## RMSE en train: 7650.706
# Calcular R^2 en train
r2_train <- cor(real_train, pred_train)^2
cat("R^2 en train:", r2_train, "\n")## R^2 en train: 0.992778
# Realizar predicciones en el conjunto de test
pred_test <- predict(modelo, test)
# Evaluar el desempeño en test usando postResample (RMSE, R², MAE)
metrics_test <- postResample(pred = pred_test, obs = test$SalePrice)
print(metrics_test)## RMSE Rsquared MAE
## 1.319967e+04 9.769294e-01 5.807615e+03
# Alternativamente, usando el paquete Metrics:
rmse_test <- rmse(test$SalePrice, pred_test)
mae_test <- mae(test$SalePrice, pred_test)
r2_test <- cor(test$SalePrice, pred_test)^2
cat("Test RMSE:", rmse_test, "\n")## Test RMSE: 13199.67
## Test MAE: 5807.615
## Test R²: 0.9769294