# Random Forest es un algoritmo de aprendizaje automático supervisado que se
# usa para clasificación y regresión. Construye múltiples árboles de decisión
# y combina sus resultados para obtener predicciones más precisas y estables.
#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
#install.packages("caret") #entrenamiento ML
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
# Load the house-price data set.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs on a machine where this exact file exists.
# Blank cells are read as NA (in addition to the default "NA") so that missing
# categories do not turn into an empty-string factor level and are instead
# caught by the NA filtering done before modelling.
df <- read.csv(
  "/Users/osvaldotello/Desktop/House Prices.csv",
  na.strings = c("", "NA")
)
# Per-column overview of the loaded data.
summary(df)
## Id MSSubClass MSZoning LotArea
## Min. : 0.0 Min. : 20.00 Length:2919 Min. : 1300
## 1st Qu.: 729.5 1st Qu.: 20.00 Class :character 1st Qu.: 7478
## Median :1459.0 Median : 50.00 Mode :character Median : 9453
## Mean :1459.0 Mean : 57.14 Mean : 10168
## 3rd Qu.:2188.5 3rd Qu.: 70.00 3rd Qu.: 11570
## Max. :2918.0 Max. :190.00 Max. :215245
##
## LotConfig BldgType OverallCond YearBuilt
## Length:2919 Length:2919 Min. :1.000 Min. :1872
## Class :character Class :character 1st Qu.:5.000 1st Qu.:1954
## Mode :character Mode :character Median :5.000 Median :1973
## Mean :5.565 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2001
## Max. :9.000 Max. :2010
##
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF
## Min. :1950 Length:2919 Min. : 0.00 Min. : 0.0
## 1st Qu.:1965 Class :character 1st Qu.: 0.00 1st Qu.: 793.0
## Median :1993 Mode :character Median : 0.00 Median : 989.5
## Mean :1984 Mean : 49.58 Mean :1051.8
## 3rd Qu.:2004 3rd Qu.: 0.00 3rd Qu.:1302.0
## Max. :2010 Max. :1526.00 Max. :6110.0
## NA's :1 NA's :1
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
## NA's :1459
# Preview the first six rows to check how the columns were parsed.
head(df, n = 6L)
## Id MSSubClass MSZoning LotArea LotConfig BldgType OverallCond YearBuilt
## 1 0 60 RL 8450 Inside 1Fam 5 2003
## 2 1 20 RL 9600 FR2 1Fam 8 1976
## 3 2 60 RL 11250 Inside 1Fam 5 2001
## 4 3 70 RL 9550 Corner 1Fam 5 1915
## 5 4 60 RL 14260 FR2 1Fam 5 2000
## 6 5 50 RL 14115 Inside 1Fam 5 1993
## YearRemodAdd Exterior1st BsmtFinSF2 TotalBsmtSF SalePrice
## 1 2003 VinylSd 0 856 208500
## 2 1976 MetalSd 0 1262 181500
## 3 2002 VinylSd 0 920 223500
## 4 1970 Wd Sdng 0 756 140000
## 5 2000 VinylSd 0 1145 250000
## 6 1995 VinylSd 0 796 143000
# Text columns that hold categories are converted to factors in one pass.
columnas_categoricas <- c("MSZoning", "LotConfig", "BldgType", "Exterior1st")
df[columnas_categoricas] <- lapply(df[columnas_categoricas], as.factor)
# The response is stored as double rather than integer.
df[["SalePrice"]] <- as.numeric(df[["SalePrice"]])
# Show the resulting column types.
str(df)
## 'data.frame': 2919 obs. of 13 variables:
## $ Id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 6 levels "","C (all)","FV",..: 5 5 5 5 5 5 5 5 6 5 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ Exterior1st : Factor w/ 16 levels "","AsbShng","AsphShn",..: 14 10 14 15 14 14 14 8 5 10 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ SalePrice : num 208500 181500 223500 140000 250000 ...
# Keep only rows without any missing value (this also removes every row whose
# SalePrice is NA).
df <- na.omit(df)
# Fix the RNG seed so the partition below is reproducible.
set.seed(123)
# Row indices for the training set, drawn by caret from the SalePrice values;
# p = 0.7 requests 70 % of the rows and list = FALSE returns the indices as a
# matrix instead of a list.
renglones_entrenamiento <- createDataPartition(
  y = df[["SalePrice"]],
  p = 0.7,
  list = FALSE
)
# Selected rows form the training set; all remaining rows form the test set.
entrenamiento <- df[renglones_entrenamiento, ]
prueba <- df[-renglones_entrenamiento, ]
# Fit a random forest of 100 trees that predicts SalePrice from the other
# columns of the training set.
# Id is only a row identifier (0, 1, 2, ...), so it is excluded from the
# predictors; otherwise the trees can split on an arbitrary row number.
modelo <- randomForest(SalePrice ~ . - Id, data = entrenamiento, ntree = 100)
# Print the fitted model summary.
print(modelo)
##
## Call:
## randomForest(formula = SalePrice ~ ., data = entrenamiento, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 1651910865
## % Var explained: 74.24
# Predict SalePrice for the rows the model was fitted on and for the held-out
# rows.
evaluacion_entrenamiento <- predict(modelo, entrenamiento)
evaluacion_prueba <- predict(modelo, prueba)
# SalePrice is numeric, so a confusion matrix (a classification tool) does not
# apply. caret::postResample() reports RMSE, R-squared and MAE instead.
metricas_entrenamiento <- postResample(
  pred = evaluacion_entrenamiento,
  obs = entrenamiento$SalePrice
)
metricas_prueba <- postResample(
  pred = evaluacion_prueba,
  obs = prueba$SalePrice
)
# Side-by-side comparison; a large train/test gap points to overfitting.
print(rbind(entrenamiento = metricas_entrenamiento, prueba = metricas_prueba))
# Reuse the test-set predictions instead of calling predict() a second time.
prediccion <- evaluacion_prueba
prediccion
## 1 4 8 11 13 15 16 22
## 199809.90 145644.22 165499.91 140458.45 146105.26 154525.71 123443.38 103428.58
## 30 38 39 41 42 45 52 62
## 91032.27 154189.87 141982.59 138142.01 156877.53 149910.49 123428.29 125450.16
## 63 70 72 73 74 75 76 80
## 208845.84 168173.89 138146.71 199568.58 159280.23 112314.50 93399.10 129047.60
## 83 99 105 107 110 111 112 120
## 256561.68 103646.96 117309.19 154548.73 200705.33 131611.53 173580.10 176328.70
## 125 126 133 134 137 143 144 153
## 199092.36 121908.64 143763.76 175744.43 153001.83 117980.86 218462.00 175315.15
## 156 158 162 166 168 175 178 182
## 110581.92 222828.69 309861.31 161180.75 323204.28 221509.67 145694.74 138054.22
## 183 187 189 190 196 198 206 211
## 120402.78 208901.55 139127.51 249980.17 149034.91 241575.94 214484.77 114052.60
## 215 219 220 224 225 227 228 231
## 171414.57 209108.97 187350.57 129119.21 388513.46 254101.91 101826.66 139271.72
## 233 235 237 238 239 240 244 245
## 95609.88 196072.90 202015.90 224834.26 304001.84 107611.13 157844.22 200293.37
## 246 252 254 256 262 263 265 266
## 260357.88 243705.10 173563.56 194935.69 322393.32 155226.22 109946.43 186689.09
## 273 275 277 278 279 286 289 297
## 286272.31 134633.73 220596.53 124200.53 385428.54 167524.01 119313.28 134790.78
## 298 300 303 304 306 310 314 315
## 194571.86 177569.13 239998.56 136720.89 326686.37 324684.36 229164.35 143373.82
## 319 322 324 337 344 354 359 366
## 247582.97 274727.52 146559.61 367070.05 300656.44 110201.23 124199.02 160013.09
## 372 377 380 381 383 385 388 390
## 137247.01 191798.23 187433.82 128296.82 217326.64 325471.32 141480.12 341500.79
## 391 392 393 396 400 403 410 412
## 113489.87 203365.20 111317.84 139863.23 240598.24 143832.89 284156.29 166283.88
## 413 415 416 424 427 436 437 443
## 281913.75 244388.91 194668.17 266177.11 241946.77 205172.33 113906.47 127378.11
## 446 448 449 451 454 456 457 459
## 149454.71 206957.98 123700.82 111977.14 213696.37 153442.07 90749.29 126395.88
## 460 461 462 464 465 473 474 485
## 107827.77 214500.69 163654.45 234031.71 138927.99 144682.42 364769.98 138526.65
## 488 490 498 500 502 504 518 519
## 185546.76 121715.21 131400.96 135552.15 227176.04 225779.24 273288.82 198352.93
## 520 521 522 524 529 532 538 540
## 209458.88 139671.35 143731.66 460940.40 120521.77 127865.63 141270.65 231013.25
## 545 547 549 551 552 556 558 564
## 199803.45 138706.94 124189.95 138159.57 121747.49 112371.93 117678.96 185300.38
## 565 571 578 583 591 593 596 598
## 242056.88 158498.62 159366.42 193920.79 182959.39 134060.52 301759.81 193313.55
## 608 615 617 620 621 622 623 624
## 132631.31 93759.34 181344.52 290961.49 96555.01 215190.50 134654.64 172551.65
## 626 628 632 634 638 641 643 644
## 145365.26 149023.00 238905.68 153836.49 113485.09 269428.67 268473.96 151569.06
## 647 649 651 669 673 679 684 685
## 99052.36 129878.29 235696.47 206144.31 166948.58 351213.34 240629.48 289591.75
## 691 693 694 697 698 700 714 717
## 146826.66 261072.89 129201.38 89236.89 128293.17 210015.22 139958.25 173271.98
## 720 721 722 728 732 733 734 739
## 136660.12 227022.51 145976.16 187134.04 205532.20 272224.33 140337.84 208700.77
## 740 742 745 747 754 756 757 758
## 214535.27 132638.71 194173.54 193246.40 254765.55 171377.84 233844.45 161454.99
## 764 767 771 776 777 781 785 787
## 282669.17 195505.58 150346.49 163252.01 244687.22 180843.31 130537.27 126096.26
## 790 794 795 796 799 804 805 809
## 176141.71 268118.31 192813.28 147806.98 382088.31 373783.97 118035.41 156792.72
## 811 813 814 818 819 820 825 830
## 153338.59 79237.78 151502.54 270797.02 141972.01 215567.26 246767.72 157786.31
## 831 837 838 840 842 844 847 852
## 148284.95 155090.35 94112.16 150134.89 142148.13 151137.37 186708.28 192317.39
## 859 862 863 864 865 866 868 869
## 155659.07 136400.01 150282.41 142342.92 206873.66 133056.56 128948.26 164853.42
## 870 875 880 882 883 885 888 893
## 237333.41 110984.58 137781.10 218924.03 202916.63 119732.58 151759.18 143242.43
## 894 898 899 905 907 908 914 922
## 153234.42 122104.18 401171.01 116361.06 275759.30 213361.40 127610.33 166288.53
## 928 932 937 938 940 941 944 952
## 141975.65 131862.65 190891.27 266952.30 172641.25 189831.78 168257.41 119947.56
## 956 960 961 962 963 965 968 974
## 130493.15 171287.05 137406.80 210289.29 144445.68 201914.00 135797.46 270276.95
## 983 993 999 1000 1002 1006 1007 1008
## 186805.33 179552.00 92659.70 185047.04 100863.38 149837.34 172930.45 83510.21
## 1009 1010 1012 1016 1017 1020 1028 1030
## 288585.35 128183.01 121762.11 196603.86 237783.18 187648.99 257350.65 115740.23
## 1031 1034 1036 1037 1039 1047 1063 1064
## 132865.80 224382.00 118224.58 272184.82 120533.45 399042.84 137444.42 138077.17
## 1066 1081 1086 1089 1092 1095 1097 1106
## 291624.38 188852.85 189615.51 137804.40 168557.85 129273.64 131729.48 282831.71
## 1107 1108 1111 1118 1120 1121 1123 1126
## 150154.78 235539.00 180431.28 143620.07 132938.42 113352.11 97228.91 135255.03
## 1130 1139 1142 1143 1156 1157 1162 1163
## 130896.24 179495.67 181652.60 299268.02 197323.60 209360.58 193915.52 126443.61
## 1166 1168 1180 1184 1187 1192 1196 1197
## 259549.43 219049.09 101356.65 145628.34 151833.94 174782.12 181312.28 225265.26
## 1199 1203 1204 1207 1208 1212 1213 1219
## 194915.80 126402.05 237816.00 112837.03 240918.07 185935.52 105515.40 87037.23
## 1221 1223 1226 1232 1235 1239 1243 1248
## 126395.08 137184.40 138019.57 141398.62 126635.48 201258.98 153685.16 161500.33
## 1250 1259 1261 1262 1263 1264 1270 1271
## 118668.54 199959.13 232798.73 132651.51 162834.16 168035.86 135134.44 193918.42
## 1273 1276 1277 1288 1289 1293 1296 1297
## 147218.10 143034.11 174170.18 198261.92 244156.22 111914.82 136199.61 140380.47
## 1301 1307 1309 1310 1311 1313 1316 1320
## 221052.59 195028.54 153212.03 171784.97 261171.16 248844.23 163984.94 120505.10
## 1321 1324 1325 1329 1331 1342 1343 1344
## 146569.52 90154.86 285068.83 174440.29 248218.95 194428.68 265092.26 134369.54
## 1345 1346 1352 1356 1357 1358 1361 1362
## 196486.37 87092.37 152840.85 153547.35 126736.01 177316.97 149418.51 252372.15
## 1363 1366 1371 1372 1376 1378 1380 1381
## 148194.84 190306.18 136264.63 163019.63 263763.07 142451.35 190574.79 99375.36
## 1388 1390 1395 1396 1398 1400 1401 1409
## 128196.38 125634.85 197877.22 282755.32 131802.43 141986.18 129996.73 110678.66
## 1410 1411 1414 1418 1423 1431 1432 1433
## 232129.63 211225.04 307053.59 282720.58 144033.57 191665.90 136231.64 140498.28
## 1438 1444 1445 1457
## 346119.14 115957.24 201369.88 225904.95