Random Forest: es un algoritmo de aprendizaje automático supervisado que se usa para clasificar y/o hacer regresiones. Se basa en la creación de múltiples árboles de decisión y combina sus resultados para hacer predicciones más precisas y estables.
library(randomForest)
library(caret)
# Read the house-price CSV and keep only the rows with no missing value in
# any column, then print per-column summary statistics.
df <- na.omit(
  read.csv(file = "C:\\Users\\USUARIO\\Desktop\\Bloque - Modulo 2\\House Prices.csv")
)
summary(object = df)
## Id MSSubClass MSZoning LotArea
## Min. : 0.0 Min. : 20.0 Length:1460 Min. : 1300
## 1st Qu.: 364.8 1st Qu.: 20.0 Class :character 1st Qu.: 7554
## Median : 729.5 Median : 50.0 Mode :character Median : 9478
## Mean : 729.5 Mean : 56.9 Mean : 10517
## 3rd Qu.:1094.2 3rd Qu.: 70.0 3rd Qu.: 11602
## Max. :1459.0 Max. :190.0 Max. :215245
## LotConfig BldgType OverallCond YearBuilt
## Length:1460 Length:1460 Min. :1.000 Min. :1872
## Class :character Class :character 1st Qu.:5.000 1st Qu.:1954
## Mode :character Mode :character Median :5.000 Median :1973
## Mean :5.575 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2000
## Max. :9.000 Max. :2010
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF
## Min. :1950 Length:1460 Min. : 0.00 Min. : 0.0
## 1st Qu.:1967 Class :character 1st Qu.: 0.00 1st Qu.: 795.8
## Median :1994 Mode :character Median : 0.00 Median : 991.5
## Mean :1985 Mean : 46.55 Mean :1057.4
## 3rd Qu.:2004 3rd Qu.: 0.00 3rd Qu.:1298.2
## Max. :2010 Max. :1474.00 Max. :6110.0
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Show the first six rows to eyeball the column layout.
head(df, n = 6L)
## Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1 0 60 RL 8450 Inside 1Fam 5 2003
## 2 1 20 RL 9600 FR2 1Fam 8 1976
## 3 2 60 RL 11250 Inside 1Fam 5 2001
## 4 3 70 RL 9550 Corner 1Fam 5 1915
## 5 4 60 RL 14260 FR2 1Fam 5 2000
## 6 5 50 RL 14115 Inside 1Fam 5 1993
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1 2003 VinylSd 0 856 208500
## 2 1976 MetalSd 0 1262 181500
## 3 2002 VinylSd 0 920 223500
## 4 1970 Wd Sdng 0 756 140000
## 5 2000 VinylSd 0 1145 250000
## 6 1995 VinylSd 0 796 143000
# Inspect column types before any conversion.
str(object = df)
## 'data.frame': 1460 obs. of 13 variables:
## $ Id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, "na.action")= 'omit' Named int [1:1459] 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
## ..- attr(*, "names")= chr [1:1459] "1461" "1462" "1463" "1464" ...
# Convert the character columns to factors so they are handled as categorical
# predictors by the model.
df$MSZoning <- as.factor(df$MSZoning)
df$LotConfig <- as.factor(df$LotConfig)
df$BldgType <- as.factor(df$BldgType)
df$Exterior1st <- as.factor(df$Exterior1st)
# SalePrice is intentionally left numeric. Turning it into a factor would
# create one class per distinct price (663 levels) and make randomForest fit
# a classifier instead of a regression on the price.
str(df)
## 'data.frame': 1460 obs. of 13 variables:
## $ Id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, "na.action")= 'omit' Named int [1:1459] 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
## ..- attr(*, "names")= chr [1:1459] "1461" "1462" "1463" "1464" ...
# Reproducible 70/30 split of the rows into training and test sets.
set.seed(123)
renglones_entrenamiento <- createDataPartition(
  df$SalePrice,
  p = 0.7,
  list = FALSE
)
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]

# Id is a row identifier, not a property of the house, so it is removed from
# the predictors before fitting. `SalePrice ~ .` would otherwise include it.
columnas_modelo <- setdiff(names(entrenamiento), "Id")
modelo <- randomForest(
  SalePrice ~ .,
  data = entrenamiento[, columnas_modelo, drop = FALSE],
  ntree = 100
)
# print(modelo)

# Predictions on the rows the forest was trained on (optimistic by
# construction) and on the held-out rows.
evaluacion_entrenamiento <- predict(modelo, entrenamiento)
# Misspelled original name kept as an alias so later code that refers to it
# keeps working.
evaluaciion_entrenamiento <- evaluacion_entrenamiento
evaluacion_prueba <- predict(modelo, prueba)

# Performance summary from caret: RMSE / R-squared / MAE for a numeric
# target, Accuracy / Kappa for a factor target. (A confusion matrix is only
# meaningful when the target is categorical.)
metricas_entrenamiento <- postResample(
  pred = evaluacion_entrenamiento,
  obs = entrenamiento$SalePrice
)
metricas_prueba <- postResample(
  pred = evaluacion_prueba,
  obs = prueba$SalePrice
)
metricas_entrenamiento
metricas_prueba

# Same predictions as evaluacion_prueba; reused instead of calling predict()
# a second time on identical inputs.
prediccion <- evaluacion_prueba