Teoría

Random Forest es un algoritmo de aprendizaje automático supervisado que se usa para clasificación y/o regresión. Se basa en la creación de múltiples árboles de decisión, cada uno entrenado con una muestra aleatoria de los datos, y combina sus resultados (voto mayoritario en clasificación, promedio en regresión) para hacer predicciones más precisas y estables.

Instalar paquetes y llamar librerías

#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
#install.packages("caret") #entrenamiento ML
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice

Importar base de datos

# Load the house-price CSV into the data frame `df` used by every later step.
# NOTE(review): this is an absolute path tied to one machine; a project-relative
# path would let the report be re-run elsewhere.
df <- read.csv("/Users/osvaldotello/Desktop/House Prices.csv")

Entender la base de datos

# Per-column summary: ranges for numeric columns and NA counts where present.
summary(df)
##        Id           MSSubClass       MSZoning            LotArea      
##  Min.   :   0.0   Min.   : 20.00   Length:2919        Min.   :  1300  
##  1st Qu.: 729.5   1st Qu.: 20.00   Class :character   1st Qu.:  7478  
##  Median :1459.0   Median : 50.00   Mode  :character   Median :  9453  
##  Mean   :1459.0   Mean   : 57.14                      Mean   : 10168  
##  3rd Qu.:2188.5   3rd Qu.: 70.00                      3rd Qu.: 11570  
##  Max.   :2918.0   Max.   :190.00                      Max.   :215245  
##                                                                       
##   LotConfig           BldgType          OverallCond      YearBuilt   
##  Length:2919        Length:2919        Min.   :1.000   Min.   :1872  
##  Class :character   Class :character   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Mode  :character   Median :5.000   Median :1973  
##                                        Mean   :5.565   Mean   :1971  
##                                        3rd Qu.:6.000   3rd Qu.:2001  
##                                        Max.   :9.000   Max.   :2010  
##                                                                      
##   YearRemodAdd  Exterior1st          BsmtFinSF2       TotalBsmtSF    
##  Min.   :1950   Length:2919        Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:1965   Class :character   1st Qu.:   0.00   1st Qu.: 793.0  
##  Median :1993   Mode  :character   Median :   0.00   Median : 989.5  
##  Mean   :1984                      Mean   :  49.58   Mean   :1051.8  
##  3rd Qu.:2004                      3rd Qu.:   0.00   3rd Qu.:1302.0  
##  Max.   :2010                      Max.   :1526.00   Max.   :6110.0  
##                                    NA's   :1         NA's   :1       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
##  NA's   :1459
# Show the first rows to check column names and raw values.
head(df)
##   Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1  0         60       RL    8450    Inside     1Fam           5      2003
## 2  1         20       RL    9600       FR2     1Fam           8      1976
## 3  2         60       RL   11250    Inside     1Fam           5      2001
## 4  3         70       RL    9550    Corner     1Fam           5      1915
## 5  4         60       RL   14260       FR2     1Fam           5      2000
## 6  5         50       RL   14115    Inside     1Fam           5      1993
##   YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1         2003     VinylSd          0         856    208500
## 2         1976     MetalSd          0        1262    181500
## 3         2002     VinylSd          0         920    223500
## 4         1970     Wd Sdng          0         756    140000
## 5         2000     VinylSd          0        1145    250000
## 6         1995     VinylSd          0         796    143000
# Convert the categorical text columns to factors in a single pass.
columnas_categoricas <- c("MSZoning", "LotConfig", "BldgType", "Exterior1st")
df[columnas_categoricas] <- lapply(df[columnas_categoricas], as.factor)
# Store the response as numeric.
df[["SalePrice"]] <- as.numeric(df[["SalePrice"]])
# Inspect the resulting column types.
str(df)
## 'data.frame':    2919 obs. of  13 variables:
##  $ Id          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ MSSubClass  : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning    : Factor w/ 6 levels "","C (all)","FV",..: 5 5 5 5 5 5 5 5 6 5 ...
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ LotConfig   : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ BldgType    : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ OverallCond : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt   : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd: int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ Exterior1st : Factor w/ 16 levels "","AsbShng","AsphShn",..: 14 10 14 15 14 14 14 8 5 10 ...
##  $ BsmtFinSF2  : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ TotalBsmtSF : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ SalePrice   : num  208500 181500 223500 140000 250000 ...
# Drop every row that has an NA in any column. summary(df) reported 1459 NA's
# in SalePrice, so those rows without a price are removed here.
# NOTE(review): the "" level shown by str(df) for MSZoning and Exterior1st is an
# empty string, not NA, so blank entries in those columns are not treated as
# missing by this step.
df <- na.omit(df)

Entrenar el modelo

# Fix the RNG seed so the random split and the forest below are repeatable.
set.seed(123)
# Sample 70% of the rows for training; list=FALSE so the result can be used
# directly as a row index.
renglones_entrenamiento <- createDataPartition(df$SalePrice, p=0.7, list=FALSE)
# Training set = sampled rows; test set = all remaining rows.
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]
# Fit a 100-tree random forest with SalePrice as the response and every other
# column of `entrenamiento` as a predictor.
# NOTE(review): "." also pulls in Id, which looks like a row identifier rather
# than a real feature -- consider excluding it from the predictors.
modelo <- randomForest(SalePrice ~ ., data=entrenamiento, ntree=100)
# Print the fitted model summary.
print(modelo)
## 
## Call:
##  randomForest(formula = SalePrice ~ ., data = entrenamiento, ntree = 100) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 1651910865
##                     % Var explained: 74.24

Evaluar el modelo

# Score the fitted forest on both partitions.
# NOTE: predicting on the training rows reuses trees that were grown on those
# same rows, so the training metrics are optimistic; the test-set metrics are
# the ones to report.
evaluacion_entrenamiento <- predict(modelo, entrenamiento)
evaluacion_prueba <- predict(modelo, prueba)
# SalePrice is numeric, so this is a regression problem. confusionMatrix() is a
# classification tool and does not apply; postResample() returns RMSE,
# R-squared and MAE for predicted vs. observed values.
metricas_entrenamiento <- postResample(
  pred = evaluacion_entrenamiento,
  obs = entrenamiento$SalePrice
)
metricas_prueba <- postResample(
  pred = evaluacion_prueba,
  obs = prueba$SalePrice
)
# Show both sets of metrics side by side to make over-fitting visible.
print(rbind(entrenamiento = metricas_entrenamiento, prueba = metricas_prueba))

Generar predicciones

# Predict SalePrice for the held-out rows and print every prediction.
# NOTE(review): this repeats the predict(modelo, prueba) call whose result is
# already stored in evaluacion_prueba.
prediccion <- predict(modelo, prueba)
prediccion
##         1         4         8        11        13        15        16        22 
## 199809.90 145644.22 165499.91 140458.45 146105.26 154525.71 123443.38 103428.58 
##        30        38        39        41        42        45        52        62 
##  91032.27 154189.87 141982.59 138142.01 156877.53 149910.49 123428.29 125450.16 
##        63        70        72        73        74        75        76        80 
## 208845.84 168173.89 138146.71 199568.58 159280.23 112314.50  93399.10 129047.60 
##        83        99       105       107       110       111       112       120 
## 256561.68 103646.96 117309.19 154548.73 200705.33 131611.53 173580.10 176328.70 
##       125       126       133       134       137       143       144       153 
## 199092.36 121908.64 143763.76 175744.43 153001.83 117980.86 218462.00 175315.15 
##       156       158       162       166       168       175       178       182 
## 110581.92 222828.69 309861.31 161180.75 323204.28 221509.67 145694.74 138054.22 
##       183       187       189       190       196       198       206       211 
## 120402.78 208901.55 139127.51 249980.17 149034.91 241575.94 214484.77 114052.60 
##       215       219       220       224       225       227       228       231 
## 171414.57 209108.97 187350.57 129119.21 388513.46 254101.91 101826.66 139271.72 
##       233       235       237       238       239       240       244       245 
##  95609.88 196072.90 202015.90 224834.26 304001.84 107611.13 157844.22 200293.37 
##       246       252       254       256       262       263       265       266 
## 260357.88 243705.10 173563.56 194935.69 322393.32 155226.22 109946.43 186689.09 
##       273       275       277       278       279       286       289       297 
## 286272.31 134633.73 220596.53 124200.53 385428.54 167524.01 119313.28 134790.78 
##       298       300       303       304       306       310       314       315 
## 194571.86 177569.13 239998.56 136720.89 326686.37 324684.36 229164.35 143373.82 
##       319       322       324       337       344       354       359       366 
## 247582.97 274727.52 146559.61 367070.05 300656.44 110201.23 124199.02 160013.09 
##       372       377       380       381       383       385       388       390 
## 137247.01 191798.23 187433.82 128296.82 217326.64 325471.32 141480.12 341500.79 
##       391       392       393       396       400       403       410       412 
## 113489.87 203365.20 111317.84 139863.23 240598.24 143832.89 284156.29 166283.88 
##       413       415       416       424       427       436       437       443 
## 281913.75 244388.91 194668.17 266177.11 241946.77 205172.33 113906.47 127378.11 
##       446       448       449       451       454       456       457       459 
## 149454.71 206957.98 123700.82 111977.14 213696.37 153442.07  90749.29 126395.88 
##       460       461       462       464       465       473       474       485 
## 107827.77 214500.69 163654.45 234031.71 138927.99 144682.42 364769.98 138526.65 
##       488       490       498       500       502       504       518       519 
## 185546.76 121715.21 131400.96 135552.15 227176.04 225779.24 273288.82 198352.93 
##       520       521       522       524       529       532       538       540 
## 209458.88 139671.35 143731.66 460940.40 120521.77 127865.63 141270.65 231013.25 
##       545       547       549       551       552       556       558       564 
## 199803.45 138706.94 124189.95 138159.57 121747.49 112371.93 117678.96 185300.38 
##       565       571       578       583       591       593       596       598 
## 242056.88 158498.62 159366.42 193920.79 182959.39 134060.52 301759.81 193313.55 
##       608       615       617       620       621       622       623       624 
## 132631.31  93759.34 181344.52 290961.49  96555.01 215190.50 134654.64 172551.65 
##       626       628       632       634       638       641       643       644 
## 145365.26 149023.00 238905.68 153836.49 113485.09 269428.67 268473.96 151569.06 
##       647       649       651       669       673       679       684       685 
##  99052.36 129878.29 235696.47 206144.31 166948.58 351213.34 240629.48 289591.75 
##       691       693       694       697       698       700       714       717 
## 146826.66 261072.89 129201.38  89236.89 128293.17 210015.22 139958.25 173271.98 
##       720       721       722       728       732       733       734       739 
## 136660.12 227022.51 145976.16 187134.04 205532.20 272224.33 140337.84 208700.77 
##       740       742       745       747       754       756       757       758 
## 214535.27 132638.71 194173.54 193246.40 254765.55 171377.84 233844.45 161454.99 
##       764       767       771       776       777       781       785       787 
## 282669.17 195505.58 150346.49 163252.01 244687.22 180843.31 130537.27 126096.26 
##       790       794       795       796       799       804       805       809 
## 176141.71 268118.31 192813.28 147806.98 382088.31 373783.97 118035.41 156792.72 
##       811       813       814       818       819       820       825       830 
## 153338.59  79237.78 151502.54 270797.02 141972.01 215567.26 246767.72 157786.31 
##       831       837       838       840       842       844       847       852 
## 148284.95 155090.35  94112.16 150134.89 142148.13 151137.37 186708.28 192317.39 
##       859       862       863       864       865       866       868       869 
## 155659.07 136400.01 150282.41 142342.92 206873.66 133056.56 128948.26 164853.42 
##       870       875       880       882       883       885       888       893 
## 237333.41 110984.58 137781.10 218924.03 202916.63 119732.58 151759.18 143242.43 
##       894       898       899       905       907       908       914       922 
## 153234.42 122104.18 401171.01 116361.06 275759.30 213361.40 127610.33 166288.53 
##       928       932       937       938       940       941       944       952 
## 141975.65 131862.65 190891.27 266952.30 172641.25 189831.78 168257.41 119947.56 
##       956       960       961       962       963       965       968       974 
## 130493.15 171287.05 137406.80 210289.29 144445.68 201914.00 135797.46 270276.95 
##       983       993       999      1000      1002      1006      1007      1008 
## 186805.33 179552.00  92659.70 185047.04 100863.38 149837.34 172930.45  83510.21 
##      1009      1010      1012      1016      1017      1020      1028      1030 
## 288585.35 128183.01 121762.11 196603.86 237783.18 187648.99 257350.65 115740.23 
##      1031      1034      1036      1037      1039      1047      1063      1064 
## 132865.80 224382.00 118224.58 272184.82 120533.45 399042.84 137444.42 138077.17 
##      1066      1081      1086      1089      1092      1095      1097      1106 
## 291624.38 188852.85 189615.51 137804.40 168557.85 129273.64 131729.48 282831.71 
##      1107      1108      1111      1118      1120      1121      1123      1126 
## 150154.78 235539.00 180431.28 143620.07 132938.42 113352.11  97228.91 135255.03 
##      1130      1139      1142      1143      1156      1157      1162      1163 
## 130896.24 179495.67 181652.60 299268.02 197323.60 209360.58 193915.52 126443.61 
##      1166      1168      1180      1184      1187      1192      1196      1197 
## 259549.43 219049.09 101356.65 145628.34 151833.94 174782.12 181312.28 225265.26 
##      1199      1203      1204      1207      1208      1212      1213      1219 
## 194915.80 126402.05 237816.00 112837.03 240918.07 185935.52 105515.40  87037.23 
##      1221      1223      1226      1232      1235      1239      1243      1248 
## 126395.08 137184.40 138019.57 141398.62 126635.48 201258.98 153685.16 161500.33 
##      1250      1259      1261      1262      1263      1264      1270      1271 
## 118668.54 199959.13 232798.73 132651.51 162834.16 168035.86 135134.44 193918.42 
##      1273      1276      1277      1288      1289      1293      1296      1297 
## 147218.10 143034.11 174170.18 198261.92 244156.22 111914.82 136199.61 140380.47 
##      1301      1307      1309      1310      1311      1313      1316      1320 
## 221052.59 195028.54 153212.03 171784.97 261171.16 248844.23 163984.94 120505.10 
##      1321      1324      1325      1329      1331      1342      1343      1344 
## 146569.52  90154.86 285068.83 174440.29 248218.95 194428.68 265092.26 134369.54 
##      1345      1346      1352      1356      1357      1358      1361      1362 
## 196486.37  87092.37 152840.85 153547.35 126736.01 177316.97 149418.51 252372.15 
##      1363      1366      1371      1372      1376      1378      1380      1381 
## 148194.84 190306.18 136264.63 163019.63 263763.07 142451.35 190574.79  99375.36 
##      1388      1390      1395      1396      1398      1400      1401      1409 
## 128196.38 125634.85 197877.22 282755.32 131802.43 141986.18 129996.73 110678.66 
##      1410      1411      1414      1418      1423      1431      1432      1433 
## 232129.63 211225.04 307053.59 282720.58 144033.57 191665.90 136231.64 140498.28 
##      1438      1444      1445      1457 
## 346119.14 115957.24 201369.88 225904.95
LS0tCnRpdGxlOiAiUkYiCmF1dGhvcjogIk9zdmFsZG8gVGVsbG8gLSBBMDEyODU2NDIiCmRhdGU6ICIyMDI1LTAyLTI0IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IGpvdXJuYWwKLS0tCgohW10oL1VzZXJzL29zdmFsZG90ZWxsby9EZXNrdG9wL0hkV3kuZ2lmKQoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+VGVvcsOtYTwvc3Bhbj4KKipSYW5kb20gRm9yZXN0KiogZXMgdW4gYWxnb3JpdG1vIGRlIGFwcmVuZGl6YWplIGF1dG9tw6F0aWNvIHN1cGVydmlzYWRvIHF1ZSBzZSB1c2EgcGFyYSBjbGFzaWZpY2FyIHkvbyBoYWNlciByZWdyZXNpb25lcy4gU2UgYmFzYSBlbiBsYSBjcmVhY2nDs24gZGUgbcO6bHRpcGxlcyDDoXJib2xlcyBkZSBkZWNpc2nDs24geSBjb21iaW5hIHN1cyByZXN1bHRhZG9zIHBhcmEgaGFjZXIgcHJlZGljY2lvbmVzIG3DoXMgcHJlY2lzYXMgeSBlc3RhYmxlcy4KCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmVlbjsiPkluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcsOtYXM8L3NwYW4+CmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygicmFuZG9tRm9yZXN0IikKbGlicmFyeShyYW5kb21Gb3Jlc3QpCiNpbnN0YWxsLnBhY2thZ2VzKCJjYXJldCIpICNlbnRyZW5hbWllbnRvIE1MCmxpYnJhcnkoY2FyZXQpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+SW1wb3J0YXIgYmFzZSBkZSBkYXRvczwvc3Bhbj4KYGBge3J9CmRmIDwtIHJlYWQuY3N2KCIvVXNlcnMvb3N2YWxkb3RlbGxvL0Rlc2t0b3AvSG91c2UgUHJpY2VzLmNzdiIpCmBgYAoKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmVlbjsiPkVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+CmBgYHtyfQpzdW1tYXJ5KGRmKQpoZWFkKGRmKQpkZiRNU1pvbmluZyA8LSBhcy5mYWN0b3IoZGYkTVNab25pbmcpCmRmJExvdENvbmZpZyA8LSBhcy5mYWN0b3IoZGYkTG90Q29uZmlnKQpkZiRCbGRnVHlwZSA8LSBhcy5mYWN0b3IoZGYkQmxkZ1R5cGUpCmRmJEV4dGVyaW9yMXN0IDwtIGFzLmZhY3RvcihkZiRFeHRlcmlvcjFzdCkKZGYkU2FsZVByaWNlIDwtIGFzLm51bWVyaWMoZGYkU2FsZVByaWNlKQpzdHIoZGYpCmRmIDwtIG5hLm9taXQoZGYpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+RW50cmVuYXIgZWwgbW9kZWxvPC9zcGFuPgpgYGB7cn0Kc2V0LnNlZWQoMTIzKQpyZW5nbG9uZXNfZW50cmVuYW1pZW50byA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKGRmJFNhbGVQcmljZSwgcD0wLjcsIGxpc3Q9RkFMU0UpCmVudHJlbmFtaWVudG8gPC0gZGZbcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8sIF0KcHJ1ZWJhIDwtIGRmWy1yZW5nbG9uZXNfZW50cmVuYW1pZW50bywgXQptb2RlbG8gPC0gcmFuZG9tRm9yZXN0KFNhbGVQcmljZSB+IC4sIGRhdGE9ZW50cmVuYW1pZW50bywgbnRyZWU9MTAw
KQpwcmludChtb2RlbG8pCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+RXZhbHVhciBlbCBtb2RlbG88L3NwYW4+CmBgYHtyfQpldmFsdWFjaW9uX2VudHJlbmFtaWVudG8gPC0gcHJlZGljdChtb2RlbG8sIGVudHJlbmFtaWVudG8pCmV2YWx1YWNpb25fcHJ1ZWJhIDwtIHByZWRpY3QobW9kZWxvLCBwcnVlYmEpCiNtYXRyaXpfY29uZnVzaW9uX2VudHJlbmFtaWVudG8gPC0gY29uZnVzaW9uTWF0cml4KGV2YWx1YWNpb25fZW50cmVuYW1pZW50bywgZW50cmVuYW1pZW50byRTYWxlUHJpY2UpCiNtYXRyaXpfY29uZnVzaW9uX3BydWViYSA8LSBjb25mdXNpb25NYXRyaXgoZXZhbHVhY2lvbl9wcnVlYmEsIHBydWViYSRTYWxlUHJpY2UpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyZWVuOyI+R2VuZXJhciBwcmVkaWNjaW9uZXM8L3NwYW4+CmBgYHtyfQpwcmVkaWNjaW9uIDwtIHByZWRpY3QobW9kZWxvLCBwcnVlYmEpCnByZWRpY2Npb24KYGBgCgoK