Random Forest es un algoritmo de aprendizaje automático supervisado que se usa para clasificación y/o regresión: crea múltiples árboles de decisión y combina sus resultados para hacer predicciones más precisas y estables.
library(randomForest)
library(caret)
library(readr)
library(lattice)
# Load the house-price data set; readr infers each column's type from the file.
df <- read_csv(file = "House Prices.csv")
## Rows: 2919 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): MSZoning, LotConfig, BldgType, Exterior1st
## dbl (9): Id, MSSubClass, LotArea, OverallCond, YearBuilt, YearRemodAdd, Bsmt...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Interactive viewer call, left disabled.
# NOTE(review): `House_Prices` is not defined in the visible script (the data
# is loaded into `df`) -- confirm the intended object before re-enabling.
# View(House_Prices)
# Per-column summary statistics; the NA counts show which columns have gaps.
summary(df)
## Id MSSubClass MSZoning LotArea
## Min. : 0.0 Min. : 20.00 Length:2919 Min. : 1300
## 1st Qu.: 729.5 1st Qu.: 20.00 Class :character 1st Qu.: 7478
## Median :1459.0 Median : 50.00 Mode :character Median : 9453
## Mean :1459.0 Mean : 57.14 Mean : 10168
## 3rd Qu.:2188.5 3rd Qu.: 70.00 3rd Qu.: 11570
## Max. :2918.0 Max. :190.00 Max. :215245
##
## LotConfig BldgType OverallCond YearBuilt
## Length:2919 Length:2919 Min. :1.000 Min. :1872
## Class :character Class :character 1st Qu.:5.000 1st Qu.:1954
## Mode :character Mode :character Median :5.000 Median :1973
## Mean :5.565 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2001
## Max. :9.000 Max. :2010
##
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF
## Min. :1950 Length:2919 Min. : 0.00 Min. : 0.0
## 1st Qu.:1965 Class :character 1st Qu.: 0.00 1st Qu.: 793.0
## Median :1993 Mode :character Median : 0.00 Median : 989.5
## Mean :1984 Mean : 49.58 Mean :1051.8
## 3rd Qu.:2004 3rd Qu.: 0.00 3rd Qu.:1302.0
## Max. :2010 Max. :1526.00 Max. :6110.0
## NA's :1 NA's :1
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
## NA's :1459
# Preview the first rows of the loaded data.
head(df)
## # A tibble: 6 × 13
## Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 0 60 RL 8450 Inside 1Fam 5 2003
## 2 1 20 RL 9600 FR2 1Fam 8 1976
## 3 2 60 RL 11250 Inside 1Fam 5 2001
## 4 3 70 RL 9550 Corner 1Fam 5 1915
## 5 4 60 RL 14260 FR2 1Fam 5 2000
## 6 5 50 RL 14115 Inside 1Fam 5 1993
## # ℹ 5 more variables: YearRemodAdd <dbl>, Exterior1st <chr>, BsmtFinSF2 <dbl>,
## # TotalBsmtSF <dbl>, SalePrice <dbl>
# Show the column types as read from the CSV, before any type conversion.
str(df)
## spc_tbl_ [2,919 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:2919] 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr [1:2919] "RL" "RL" "RL" "RL" ...
## $ LotArea : num [1:2919] 8450 9600 11250 9550 14260 ...
## $ LotConfig : chr [1:2919] "Inside" "FR2" "Inside" "Corner" ...
## $ BldgType : chr [1:2919] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ OverallCond : num [1:2919] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:2919] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd: num [1:2919] 2003 1976 2002 1970 2000 ...
## $ Exterior1st : chr [1:2919] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ BsmtFinSF2 : num [1:2919] 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : num [1:2919] 856 1262 920 756 1145 ...
## $ SalePrice : num [1:2919] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotArea = col_double(),
## .. LotConfig = col_character(),
## .. BldgType = col_character(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. Exterior1st = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. TotalBsmtSF = col_double(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Encode the four text columns as factors so they are treated as categorical
# variables instead of raw character strings.
for (columna in c("MSZoning", "LotConfig", "BldgType", "Exterior1st")) {
  df[[columna]] <- as.factor(df[[columna]])
}
rm(columna)  # do not leave the loop variable behind in the workspace

# Make the response explicitly numeric (it was already read as a double).
df[["SalePrice"]] <- as.numeric(df[["SalePrice"]])

# Re-inspect the structure to confirm the new column types.
str(df)
## spc_tbl_ [2,919 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:2919] 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotArea : num [1:2919] 8450 9600 11250 9550 14260 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallCond : num [1:2919] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:2919] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd: num [1:2919] 2003 1976 2002 1970 2000 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ BsmtFinSF2 : num [1:2919] 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : num [1:2919] 856 1262 920 756 1145 ...
## $ SalePrice : num [1:2919] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotArea = col_double(),
## .. LotConfig = col_character(),
## .. BldgType = col_character(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. Exterior1st = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. TotalBsmtSF = col_double(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Drop every row that has an NA in any column. According to summary(df) this
# removes the rows without a SalePrice as well as the rows with a missing
# basement measurement.
df <- na.omit(df)
# Fix the RNG state so the partition and the forest below are reproducible.
# The order of the following statements matters for that reason.
set.seed(123)
# Row indices for the training set: p = 0.7 asks caret for about 70% of rows.
renglones_entrenamiento <- createDataPartition(df$SalePrice, p = 0.7, list = FALSE)
# Training rows are the selected indices; the test set is everything else.
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]
# Fit a random forest with SalePrice as the response and every other column
# as a predictor (all tuning arguments are left at their defaults).
# NOTE(review): `SalePrice ~ .` also uses the `Id` column as a predictor; it
# looks like a row identifier -- consider excluding it and re-running.
modelo <- randomForest(SalePrice~., data= entrenamiento)
# Print the fitted model summary.
print(modelo)
##
## Call:
## randomForest(formula = SalePrice ~ ., data = entrenamiento)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 1621295030
## % Var explained: 74.72
# Predict SalePrice for the training rows and for the held-out test rows.
# NOTE(review): predictions on the rows used for fitting are usually
# optimistic; the out-of-bag figures shown by print(modelo) are the fairer
# training-side estimate -- confirm against the randomForest documentation.
evaluar_entrenamiento <- predict(modelo, entrenamiento)
evaluar_prueba <- predict(modelo, prueba)

# The model is a regression forest, so a confusion matrix (a classification
# tool) does not apply. Use caret's regression metrics (RMSE, R-squared, MAE),
# comparing predictions against the observed SalePrice values.
metricas_entrenamiento <- postResample(
  pred = evaluar_entrenamiento,
  obs = entrenamiento$SalePrice
)
metricas_prueba <- postResample(
  pred = evaluar_prueba,
  obs = prueba$SalePrice
)
print(metricas_entrenamiento)
print(metricas_prueba)

# Same values as evaluar_prueba; reuse them instead of predicting a second
# time. The name is kept for any code that refers to `prediccion`.
prediccion <- evaluar_prueba