##1.- EJEMPLOS DE ÁRBOLES DE DECISIÓN
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
str(Hitters)
## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : Factor w/ 2 levels "A","N": 1 2 1 2 2 1 2 1 2 1 ...
## $ Division : Factor w/ 2 levels "E","W": 1 2 2 1 1 2 1 2 2 1 ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: Factor w/ 2 levels "A","N": 1 2 1 2 2 1 1 1 2 1 ...
Primero eliminaremos las observaciones para las que no hay información sobre la variable respuesta Salary. Hay otras técnicas para manejar los valores ausentes, pero en este ejemplo se ha decidido eliminarlas:
sum(is.na(Hitters$Salary))
## [1] 59
# Drop every row that contains at least one NA and report the resulting
# dimensions. na.omit() checks all columns, not only Salary; the row count
# reported below (322 - 59 = 263) matches the number of missing salaries,
# so here only the rows with a missing Salary end up being removed.
datos_Hitters <- na.omit(Hitters)
dim(datos_Hitters)
## [1] 263 20
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.3
skim(datos_Hitters)
| Name | datos_Hitters |
| Number of rows | 263 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| League | 0 | 1 | FALSE | 2 | A: 139, N: 124 |
| Division | 0 | 1 | FALSE | 2 | W: 134, E: 129 |
| NewLeague | 0 | 1 | FALSE | 2 | A: 141, N: 122 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| AtBat | 0 | 1 | 403.64 | 147.31 | 19.0 | 282.5 | 413 | 526.0 | 687 | ▁▇▇▇▆ |
| Hits | 0 | 1 | 107.83 | 45.13 | 1.0 | 71.5 | 103 | 141.5 | 238 | ▂▇▇▅▁ |
| HmRun | 0 | 1 | 11.62 | 8.76 | 0.0 | 5.0 | 9 | 18.0 | 40 | ▇▅▃▂▁ |
| Runs | 0 | 1 | 54.75 | 25.54 | 0.0 | 33.5 | 52 | 73.0 | 130 | ▃▇▇▃▁ |
| RBI | 0 | 1 | 51.49 | 25.88 | 0.0 | 30.0 | 47 | 71.0 | 121 | ▂▇▅▃▁ |
| Walks | 0 | 1 | 41.11 | 21.72 | 0.0 | 23.0 | 37 | 57.0 | 105 | ▅▇▅▃▁ |
| Years | 0 | 1 | 7.31 | 4.79 | 1.0 | 4.0 | 6 | 10.0 | 24 | ▇▆▃▂▁ |
| CAtBat | 0 | 1 | 2657.54 | 2286.58 | 19.0 | 842.5 | 1931 | 3890.5 | 14053 | ▇▃▁▁▁ |
| CHits | 0 | 1 | 722.19 | 648.20 | 4.0 | 212.0 | 516 | 1054.0 | 4256 | ▇▃▁▁▁ |
| CHmRun | 0 | 1 | 69.24 | 82.20 | 0.0 | 15.0 | 40 | 92.5 | 548 | ▇▁▁▁▁ |
| CRuns | 0 | 1 | 361.22 | 331.20 | 2.0 | 105.5 | 250 | 497.5 | 2165 | ▇▂▁▁▁ |
| CRBI | 0 | 1 | 330.42 | 323.37 | 3.0 | 95.0 | 230 | 424.5 | 1659 | ▇▃▁▁▁ |
| CWalks | 0 | 1 | 260.27 | 264.06 | 1.0 | 71.0 | 174 | 328.5 | 1566 | ▇▂▁▁▁ |
| PutOuts | 0 | 1 | 290.71 | 279.93 | 0.0 | 113.5 | 224 | 322.5 | 1377 | ▇▃▁▁▁ |
| Assists | 0 | 1 | 118.76 | 145.08 | 0.0 | 8.0 | 45 | 192.0 | 492 | ▇▂▂▁▁ |
| Errors | 0 | 1 | 8.59 | 6.61 | 0.0 | 3.0 | 7 | 13.0 | 32 | ▇▅▃▁▁ |
| Salary | 0 | 1 | 535.93 | 451.12 | 67.5 | 190.0 | 425 | 750.0 | 2460 | ▇▃▁▁▁ |
# Distribución variable respuesta
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Histogram of the response variable Salary.
# theme_bw() is a complete theme: adding it replaces every theme element set
# earlier in the chain. It therefore has to come BEFORE the theme() tweaks;
# in the previous order the centred-title setting was silently discarded.
ggplot(data = datos_Hitters, aes(x = Salary)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  labs(title = "Distribución Salary") +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
La variable respuesta posee una distribución sesgada. Aplicaremos una transformación logarítmica para hacer su distribución más normal.
# Replace Salary with its natural logarithm to reduce the skew of the response.
# The column is overwritten in place, so running this line a second time
# would apply the logarithm twice.
datos_Hitters[["Salary"]] <- log(datos_Hitters[["Salary"]])
# Distribución variable respuesta
library(ggplot2)
# Histogram of the log-transformed response.
# theme_bw() is a complete theme: adding it replaces every theme element set
# earlier in the chain. It therefore has to come BEFORE the theme() tweaks;
# in the previous order the centred-title setting was silently discarded.
ggplot(data = datos_Hitters, aes(x = Salary)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  labs(title = "Distribución log[Salary]") +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Antes de pasar a generar los modelos, dividimos el conjunto de datos en un grupo de entrenamiento (para el ajuste de los modelos) y otro de test (para la evaluación de los mismos). Esta división dependerá de la cantidad de observaciones con las que contemos y de la seguridad con la que queramos obtener la estimación del test error.
# Row positions of the training observations: the first 200 rows.
# NOTE(review): this is a positional split, not a random one. The rows appear
# to be ordered by player name, so the two subsets may not be comparable;
# consider a seeded sample() instead -- confirm before changing, since every
# result below depends on this split.
train <- seq_len(200)

# Training subset (rows 1..200)
datosH_train <- datos_Hitters[train, ]

# Test subset (all remaining rows)
datosH_test <- datos_Hitters[-train, ]

# Print the full training set
datosH_train
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## -Alan Ashby 315 81 7 24 38 39 14 3449 835 69
## -Alvin Davis 479 130 18 66 72 76 3 1624 457 63
## -Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
## -Andres Galarraga 321 87 10 39 42 30 2 396 101 12
## -Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
## -Al Newman 185 37 1 23 8 21 2 214 42 1
## -Argenis Salazar 298 73 0 24 24 7 3 509 108 0
## -Andres Thomas 323 81 6 26 32 8 2 341 86 6
## -Andre Thornton 401 92 17 49 66 65 13 5206 1332 253
## -Alan Trammell 574 159 21 107 75 59 10 4631 1300 90
## -Alex Trevino 202 53 4 31 26 27 9 1876 467 15
## -Andy VanSlyke 418 113 13 48 61 47 4 1512 392 41
## -Alan Wiggins 239 60 0 30 11 22 6 1941 510 4
## -Bill Almon 196 43 7 29 27 30 13 3231 825 36
## -Buddy Bell 568 158 20 89 75 73 15 8068 2273 177
## -Buddy Biancalana 190 46 2 24 8 15 5 479 102 5
## -Bruce Bochy 127 32 8 16 22 14 8 727 180 24
## -Barry Bonds 413 92 16 72 48 65 1 413 92 16
## -Bobby Bonilla 426 109 3 55 43 62 1 426 109 3
## -Bob Brenly 472 116 16 60 62 74 6 1924 489 67
## -Bill Buckner 629 168 18 73 102 40 18 8424 2464 164
## -Brett Butler 587 163 4 92 51 70 6 2695 747 17
## -Bob Dernier 324 73 4 32 18 22 7 1931 491 13
## -Bo Diaz 474 129 10 50 56 40 10 2331 604 61
## -Bill Doran 550 152 6 92 37 81 5 2308 633 32
## -Brian Downing 513 137 20 90 95 90 14 5201 1382 166
## -Billy Hatcher 419 108 6 55 36 22 3 591 149 8
## -Brook Jacoby 583 168 17 83 80 56 5 1646 452 44
## -Bob Kearney 204 49 6 23 25 12 7 1309 308 27
## -Bill Madlock 379 106 10 38 60 30 14 6207 1906 146
## -Bob Melvin 268 60 5 24 25 15 2 350 78 5
## -BillyJo Robidoux 181 41 1 15 21 33 2 232 50 4
## -Bill Schroeder 217 46 7 32 19 9 4 694 160 32
## -Chris Bando 254 68 2 28 26 22 6 999 236 21
## -Chris Brown 416 132 7 57 49 33 3 932 273 24
## -Carmen Castillo 205 57 8 34 32 9 5 756 192 32
## -Chili Davis 526 146 13 71 70 84 6 2648 715 77
## -Carlton Fisk 457 101 14 42 63 22 17 6521 1767 281
## -Curt Ford 214 53 2 30 29 23 2 226 59 2
## -Carney Lansford 591 168 19 80 72 39 9 4478 1307 113
## -Chet Lemon 403 101 12 45 53 39 12 5150 1429 166
## -Candy Maldonado 405 102 18 49 85 20 6 950 231 29
## -Carmelo Martinez 244 58 9 28 25 35 4 1335 333 49
## -Craig Reynolds 313 78 6 32 41 12 12 3742 968 35
## -Cal Ripken 627 177 25 98 81 70 6 3210 927 133
## -Cory Snyder 416 113 24 58 69 16 1 416 113 24
## -Chris Speier 155 44 6 21 23 15 16 6631 1634 98
## -Curt Wilkerson 236 56 0 27 15 11 4 1115 270 1
## -Dave Anderson 216 53 1 31 15 22 4 926 210 9
## -Don Baylor 585 139 31 93 94 62 17 7546 1982 315
## -Daryl Boston 199 53 5 29 22 21 3 514 120 8
## -Darnell Coles 521 142 20 67 86 45 4 815 205 22
## -Dave Concepcion 311 81 3 42 30 26 17 8247 2198 100
## -Doug DeCinces 512 131 26 69 96 52 14 5347 1397 221
## -Darrell Evans 507 122 29 78 85 91 18 7761 1947 347
## -Dwight Evans 529 137 26 86 97 97 15 6661 1785 291
## -Damaso Garcia 424 119 6 57 46 13 9 3651 1046 32
## -Dan Gladden 351 97 4 55 29 39 4 1258 353 16
## -Dave Henderson 388 103 15 59 47 39 6 2174 555 80
## -Donnie Hill 339 96 4 37 29 23 4 1064 290 11
## -Davey Lopes 255 70 7 49 35 43 15 6311 1661 154
## -Don Mattingly 677 238 31 117 113 53 5 2223 737 93
## -Dale Murphy 614 163 29 89 83 75 11 5017 1388 266
## -Dwayne Murphy 329 83 9 50 39 56 9 3828 948 145
## -Dave Parker 637 174 31 89 116 56 14 6727 2024 247
## -Dan Pasqua 280 82 16 44 45 47 2 428 113 25
## -Darrell Porter 155 41 12 21 29 22 16 5409 1338 181
## -Dick Schofield 458 114 13 67 57 48 4 1350 298 28
## -Don Slaught 314 83 13 39 46 16 5 1457 405 28
## -Darryl Strawberry 475 123 27 76 93 72 4 1810 471 108
## -Dale Sveum 317 78 7 35 35 32 1 317 78 7
## -Danny Tartabull 511 138 25 76 96 61 3 592 164 28
## -Denny Walling 382 119 13 54 58 36 12 2133 594 41
## -Dave Winfield 565 148 24 90 104 77 14 7287 2083 305
## -Eric Davis 415 115 27 97 71 68 3 711 184 45
## -Eddie Milner 424 110 15 70 47 36 7 2130 544 38
## -Eddie Murray 495 151 17 61 84 78 10 5624 1679 275
## -Ed Romero 233 49 2 41 23 18 8 1350 336 7
## -Frank White 566 154 22 76 84 43 14 6100 1583 131
## -George Bell 641 198 31 101 108 41 5 2129 610 92
## -Glenn Braggs 215 51 4 19 18 11 1 215 51 4
## -George Brett 441 128 16 70 73 80 14 6675 2095 209
## -Greg Brock 325 76 16 33 52 37 5 1506 351 71
## -Gary Carter 490 125 24 81 105 62 13 6063 1646 271
## -Glenn Davis 574 152 31 91 101 64 3 985 260 53
## -Gary Gaetti 596 171 34 91 108 52 6 2862 728 107
## -Greg Gagne 472 118 12 63 54 30 4 793 187 14
## -George Hendrick 283 77 14 45 47 26 16 6840 1910 259
## -Glenn Hubbard 408 94 4 42 36 66 9 3573 866 59
## -Garth Iorg 327 85 3 30 44 20 8 2140 568 16
## -Gary Matthews 370 96 21 49 46 60 15 6986 1972 231
## -Graig Nettles 354 77 16 36 55 41 20 8716 2172 384
## -Gary Pettis 539 139 5 93 58 69 5 1469 369 12
## -Gary Redus 340 84 11 62 33 47 5 1516 376 42
## -Garry Templeton 510 126 2 42 44 35 11 5562 1578 44
## -Greg Walker 282 78 13 37 51 29 5 1649 453 73
## -Gary Ward 380 120 5 54 51 31 8 3118 900 92
## -Glenn Wilson 584 158 15 70 84 42 5 2358 636 58
## -Harold Baines 570 169 21 72 88 38 7 3754 1077 140
## -Hubie Brooks 306 104 14 50 58 25 7 2954 822 55
## -Howard Johnson 220 54 10 30 39 31 5 1185 299 40
## -Hal McRae 278 70 7 22 37 18 18 7186 2081 190
## -Harold Reynolds 445 99 1 46 24 29 4 618 129 1
## -Harry Spilman 143 39 5 18 30 15 9 639 151 16
## -Herm Winningham 185 40 4 23 11 18 3 524 125 7
## -Jesse Barfield 589 170 40 107 108 69 6 2325 634 128
## -Juan Beniquez 343 103 6 48 36 40 15 4338 1193 70
## -John Cangelosi 438 103 2 65 32 71 2 440 103 2
## -Jose Canseco 600 144 33 85 117 65 2 696 173 38
## -Joe Carter 663 200 29 108 121 32 4 1447 404 57
## -Jack Clark 232 55 9 34 23 45 12 4405 1213 194
## -Jose Cruz 479 133 10 48 72 55 17 7472 2147 153
## -Jody Davis 528 132 21 61 74 41 6 2641 671 97
## -Jim Dwyer 160 39 8 18 31 22 14 2128 543 56
## -Julio Franco 599 183 10 80 74 32 5 2482 715 27
## -Jim Gantner 497 136 7 58 38 26 11 3871 1066 40
## -Johnny Grubb 210 70 13 32 51 28 15 4040 1130 97
## -Jack Howell 151 41 4 26 21 19 2 288 68 9
## -John Kruk 278 86 4 33 38 45 1 278 86 4
## -Jeffrey Leonard 341 95 6 48 42 20 10 2964 808 81
## -Jim Morrison 537 147 23 58 88 47 10 2744 730 97
## -John Moses 399 102 3 56 34 34 5 670 167 4
## -Jerry Mumphrey 309 94 5 37 32 26 13 4618 1330 57
## -Jim Presley 616 163 27 83 107 32 3 1437 377 65
## -Johnny Ray 579 174 7 67 78 58 6 3053 880 32
## -Jeff Reed 165 39 2 13 9 16 3 196 44 2
## -Jim Rice 618 200 20 98 110 62 13 7127 2163 351
## -Jerry Royster 257 66 5 31 26 32 14 3910 979 33
## -John Russell 315 76 13 35 60 25 3 630 151 24
## -Juan Samuel 591 157 16 90 78 26 4 2020 541 52
## -John Shelby 404 92 11 54 49 18 6 1354 325 30
## -Joel Skinner 315 73 5 23 37 16 4 450 108 6
## -Jim Sundberg 429 91 12 41 42 57 13 5590 1397 83
## -Jose Uribe 453 101 3 46 43 61 3 948 218 6
## -Joel Youngblood 184 47 5 20 28 18 11 3327 890 74
## -Kevin Bass 591 184 20 83 79 38 5 1689 462 40
## -Kal Daniels 181 58 6 34 23 22 1 181 58 6
## -Kirk Gibson 441 118 28 84 86 68 8 2723 750 126
## -Ken Griffey 490 150 21 69 58 35 14 6126 1839 121
## -Keith Hernandez 551 171 13 94 83 94 13 6090 1840 128
## -Kent Hrbek 550 147 29 85 91 71 6 2816 815 117
## -Ken Landreaux 283 74 4 34 29 22 10 3919 1062 85
## -Kevin McReynolds 560 161 26 89 96 66 4 1789 470 65
## -Kevin Mitchell 328 91 12 51 43 33 2 342 94 12
## -Keith Moreland 586 159 12 72 79 53 9 3082 880 83
## -Ken Oberkfell 503 136 5 62 48 83 10 3423 970 20
## -Ken Phelps 344 85 24 69 64 88 7 911 214 64
## -Kirby Puckett 680 223 31 119 96 34 3 1928 587 35
## -Kurt Stillwell 279 64 0 31 26 30 1 279 64 0
## -Leon Durham 484 127 20 66 65 67 7 3006 844 116
## -Len Dykstra 431 127 8 77 45 58 2 667 187 9
## -Larry Herndon 283 70 8 33 37 27 12 4479 1222 94
## -Lee Lacy 491 141 11 77 47 37 15 4291 1240 84
## -Len Matuszek 199 52 9 26 28 21 6 805 191 30
## -Lloyd Moseby 589 149 21 89 86 64 7 3558 928 102
## -Lance Parrish 327 84 22 53 62 38 10 4273 1123 212
## -Larry Parrish 464 128 28 67 94 52 13 5829 1552 210
## -Larry Sheets 338 92 18 42 60 21 3 682 185 36
## -Lou Whitaker 584 157 20 95 73 63 10 4704 1320 93
## -Mike Aldrete 216 54 2 27 25 33 1 216 54 2
## -Marty Barrett 625 179 4 94 60 65 5 1696 476 12
## -Mike Davis 489 131 19 77 55 34 7 2051 549 62
## -Mike Diaz 209 56 12 22 36 19 2 216 58 12
## -Mariano Duncan 407 93 8 47 30 30 2 969 230 14
## -Mike Easler 490 148 14 64 78 49 13 3400 1000 113
## -Mel Hall 442 131 18 68 77 33 6 1416 398 47
## -Mike Heath 288 65 8 30 36 27 9 2815 698 55
## -Mike Kingery 209 54 3 25 14 12 1 209 54 3
## -Mike LaValliere 303 71 3 18 30 36 3 344 76 3
## -Mike Marshall 330 77 19 47 53 27 6 1928 516 90
## -Mike Pagliarulo 504 120 28 71 71 54 3 1085 259 54
## -Mark Salas 258 60 8 28 33 18 3 638 170 17
## -Mike Schmidt 20 1 0 0 0 0 2 41 9 2
## -Mike Scioscia 374 94 5 36 26 62 7 1968 519 26
## -Mickey Tettleton 211 43 10 26 35 39 3 498 116 14
## -Milt Thompson 299 75 6 38 23 26 3 580 160 8
## -Mitch Webster 576 167 8 89 49 57 4 822 232 19
## -Mookie Wilson 381 110 9 61 45 32 7 3015 834 40
## -Marvell Wynne 288 76 7 34 37 15 4 1644 408 16
## -Mike Young 369 93 9 43 42 49 5 1258 323 54
## -Ozzie Guillen 547 137 2 58 47 12 2 1038 271 3
## -Oddibe McDowell 572 152 18 105 49 65 2 978 249 36
## -Ozzie Smith 514 144 0 67 54 79 9 4739 1169 13
## -Ozzie Virgil 359 80 15 45 48 63 7 1493 359 61
## -Phil Bradley 526 163 12 88 50 77 4 1556 470 38
## -Phil Garner 313 83 9 43 41 30 14 5885 1543 104
## -Pete Incaviglia 540 135 30 82 88 55 1 540 135 30
## -Paul Molitor 437 123 9 62 55 40 9 4139 1203 79
## -Pete Rose 237 52 0 15 25 30 24 14053 4256 160
## -Pat Sheridan 236 56 6 41 19 21 5 1257 329 24
## -Pat Tabler 473 154 6 61 48 29 6 1966 566 29
## -Rafael Belliard 309 72 0 33 31 26 5 354 82 0
## -Rick Burleson 271 77 5 35 29 33 12 4933 1358 48
## -Randy Bush 357 96 7 50 45 39 5 1394 344 43
## -Rick Cerone 216 56 4 22 18 15 12 2796 665 43
## -Ron Cey 256 70 13 42 36 44 16 7058 1845 312
## -Rob Deer 466 108 33 75 86 72 3 652 142 44
## -Rick Dempsey 327 68 13 42 29 45 18 3949 939 78
## -Ron Hassey 341 110 9 45 49 46 9 2331 658 50
## -Rickey Henderson 608 160 28 130 74 89 8 4071 1182 103
## CRuns CRBI CWalks League Division PutOuts Assists Errors
## -Alan Ashby 321 414 375 N W 632 43 10
## -Alvin Davis 224 266 263 A W 880 82 14
## -Andre Dawson 828 838 354 N E 200 11 3
## -Andres Galarraga 48 46 33 N E 805 40 4
## -Alfredo Griffin 501 336 194 A W 282 421 25
## -Al Newman 30 9 24 N E 76 127 7
## -Argenis Salazar 41 37 12 A W 121 283 9
## -Andres Thomas 32 34 8 N W 143 290 19
## -Andre Thornton 784 890 866 A E 0 0 0
## -Alan Trammell 702 504 488 A E 238 445 22
## -Alex Trevino 192 186 161 N W 304 45 11
## -Andy VanSlyke 205 204 203 N E 211 11 7
## -Alan Wiggins 309 103 207 A E 121 151 6
## -Bill Almon 376 290 238 N E 80 45 8
## -Buddy Bell 1045 993 732 N W 105 290 10
## -Buddy Biancalana 65 23 39 A W 102 177 16
## -Bruce Bochy 67 82 56 N W 202 22 2
## -Barry Bonds 72 48 65 N E 280 9 5
## -Bobby Bonilla 55 43 62 A W 361 22 2
## -Bob Brenly 242 251 240 N W 518 55 3
## -Bill Buckner 1008 1072 402 A E 1067 157 14
## -Brett Butler 442 198 317 A E 434 9 3
## -Bob Dernier 291 108 180 N E 222 3 3
## -Bo Diaz 246 327 166 N W 732 83 13
## -Bill Doran 349 182 308 N W 262 329 16
## -Brian Downing 763 734 784 A W 267 5 3
## -Billy Hatcher 80 46 31 N W 226 7 4
## -Brook Jacoby 219 208 136 A E 109 292 25
## -Bob Kearney 126 132 66 A W 419 46 5
## -Bill Madlock 859 803 571 N W 72 170 24
## -Bob Melvin 34 29 18 N W 442 59 6
## -BillyJo Robidoux 20 29 45 A E 326 29 5
## -Bill Schroeder 86 76 32 A E 307 25 1
## -Chris Bando 108 117 118 A E 359 30 4
## -Chris Brown 113 121 80 N W 73 177 18
## -Carmen Castillo 117 107 51 A E 58 4 4
## -Chili Davis 352 342 289 N W 303 9 9
## -Carlton Fisk 1003 977 619 A W 389 39 4
## -Curt Ford 32 32 27 N E 109 7 3
## -Carney Lansford 634 563 319 A W 67 147 4
## -Chet Lemon 747 666 526 A E 316 6 5
## -Candy Maldonado 99 138 64 N W 161 10 3
## -Carmelo Martinez 164 179 194 N W 142 14 2
## -Craig Reynolds 409 321 170 N W 106 206 7
## -Cal Ripken 529 472 313 A E 240 482 13
## -Cory Snyder 58 69 16 A E 203 70 10
## -Chris Speier 698 661 777 N E 53 88 3
## -Curt Wilkerson 116 64 57 A W 125 199 13
## -Dave Anderson 118 69 114 N W 73 152 11
## -Don Baylor 1141 1179 727 A E 0 0 0
## -Daryl Boston 57 40 39 A W 152 3 5
## -Darnell Coles 99 103 78 A E 107 242 23
## -Dave Concepcion 950 909 690 N W 153 223 10
## -Doug DeCinces 712 815 548 A W 119 216 12
## -Darrell Evans 1175 1152 1380 A E 808 108 2
## -Dwight Evans 1082 949 989 A E 280 10 5
## -Damaso Garcia 461 301 112 A E 224 286 8
## -Dan Gladden 196 110 117 N W 226 7 3
## -Dave Henderson 285 274 186 A W 182 9 4
## -Donnie Hill 123 108 55 A W 104 213 9
## -Davey Lopes 1019 608 820 N E 51 54 8
## -Don Mattingly 349 401 171 A E 1377 100 6
## -Dale Murphy 813 822 617 N W 303 6 6
## -Dwayne Murphy 575 528 635 A W 276 6 2
## -Dave Parker 978 1093 495 N W 278 9 9
## -Dan Pasqua 61 70 63 A E 148 4 2
## -Darrell Porter 746 805 875 A W 165 9 1
## -Dick Schofield 160 123 122 A W 246 389 18
## -Don Slaught 156 159 76 A W 533 40 4
## -Darryl Strawberry 292 343 267 N E 226 10 6
## -Dale Sveum 35 35 32 A E 45 122 26
## -Danny Tartabull 87 110 71 A W 157 7 8
## -Denny Walling 287 294 227 N W 59 156 9
## -Dave Winfield 1135 1234 791 A E 292 9 5
## -Eric Davis 156 119 99 N W 274 2 7
## -Eddie Milner 335 174 258 N W 292 6 3
## -Eddie Murray 884 1015 709 A E 1045 88 13
## -Ed Romero 166 122 106 A E 102 132 10
## -Frank White 743 693 300 A W 316 439 10
## -George Bell 297 319 117 A E 269 17 10
## -Glenn Braggs 19 18 11 A E 116 5 12
## -George Brett 1072 1050 695 A W 97 218 16
## -Greg Brock 195 219 214 N W 726 87 3
## -Gary Carter 847 999 680 N E 869 62 8
## -Glenn Davis 148 173 95 N W 1253 111 11
## -Gary Gaetti 361 401 224 A W 118 334 21
## -Greg Gagne 102 80 50 A W 228 377 26
## -George Hendrick 915 1067 546 A W 144 6 5
## -Glenn Hubbard 429 365 410 N W 282 487 19
## -Garth Iorg 216 208 93 A E 91 185 12
## -Gary Matthews 1070 955 921 N E 137 5 9
## -Graig Nettles 1172 1267 1057 N W 83 174 16
## -Gary Pettis 247 126 198 A W 462 9 7
## -Gary Redus 284 141 219 N E 185 8 4
## -Garry Templeton 703 519 256 N W 207 358 20
## -Greg Walker 211 280 138 A W 670 57 5
## -Gary Ward 444 419 240 A W 237 8 1
## -Glenn Wilson 265 316 134 N E 331 20 4
## -Harold Baines 492 589 263 A W 295 15 5
## -Hubie Brooks 313 377 187 N E 116 222 15
## -Howard Johnson 145 154 128 N E 50 136 20
## -Hal McRae 935 1088 643 A W 0 0 0
## -Harold Reynolds 72 31 48 A W 278 415 16
## -Harry Spilman 80 97 61 N W 138 15 1
## -Herm Winningham 58 37 47 N E 97 2 2
## -Jesse Barfield 371 376 238 A E 368 20 3
## -Juan Beniquez 581 421 325 A E 211 56 13
## -John Cangelosi 67 32 71 A W 276 7 9
## -Jose Canseco 101 130 69 A W 319 4 14
## -Joe Carter 210 222 68 A E 241 8 6
## -Jack Clark 702 705 625 N E 623 35 3
## -Jose Cruz 980 1032 854 N W 237 5 4
## -Jody Davis 273 383 226 N E 885 105 8
## -Jim Dwyer 304 268 298 A E 33 3 0
## -Julio Franco 330 326 158 A E 231 374 18
## -Jim Gantner 450 367 241 A E 304 347 10
## -Johnny Grubb 544 462 551 A E 0 0 0
## -Jack Howell 45 39 35 A W 28 56 2
## -John Kruk 33 38 45 N W 102 4 2
## -Jeffrey Leonard 379 428 221 N W 158 4 5
## -Jim Morrison 302 351 174 N E 92 257 20
## -John Moses 89 48 54 A W 211 9 3
## -Jerry Mumphrey 616 522 436 N E 161 3 3
## -Jim Presley 181 227 82 A W 110 308 15
## -Johnny Ray 366 337 218 N E 280 479 5
## -Jeff Reed 18 10 18 A W 332 19 2
## -Jim Rice 1104 1289 564 A E 330 16 8
## -Jerry Royster 518 324 382 N W 87 166 14
## -John Russell 68 94 55 N E 498 39 13
## -Juan Samuel 310 226 91 N E 290 440 25
## -John Shelby 188 135 63 A E 222 5 5
## -Joel Skinner 38 46 28 A W 227 15 3
## -Jim Sundberg 578 579 644 A W 686 46 4
## -Jose Uribe 96 72 91 N W 249 444 16
## -Joel Youngblood 419 382 304 N W 49 2 0
## -Kevin Bass 219 195 82 N W 303 12 5
## -Kal Daniels 34 23 22 N W 88 0 3
## -Kirk Gibson 433 420 309 A E 190 2 2
## -Ken Griffey 983 707 600 A E 96 5 3
## -Keith Hernandez 969 900 917 N E 1199 149 5
## -Kent Hrbek 405 474 319 A W 1218 104 10
## -Ken Landreaux 505 456 283 N W 145 5 7
## -Kevin McReynolds 233 260 155 N W 332 9 8
## -Kevin Mitchell 51 44 33 N E 145 59 8
## -Keith Moreland 363 477 295 N E 181 13 4
## -Ken Oberkfell 408 303 414 N W 65 258 8
## -Ken Phelps 150 156 187 A W 0 0 0
## -Kirby Puckett 262 201 91 A W 429 8 6
## -Kurt Stillwell 31 26 30 N W 107 205 16
## -Leon Durham 436 458 377 N E 1231 80 7
## -Len Dykstra 117 64 88 N E 283 8 3
## -Larry Herndon 557 483 307 A E 156 2 2
## -Lee Lacy 615 430 340 A E 239 8 2
## -Len Matuszek 113 119 87 N W 235 22 5
## -Lloyd Moseby 513 471 351 A E 371 6 6
## -Lance Parrish 577 700 334 A E 483 48 6
## -Larry Parrish 740 840 452 A W 0 0 0
## -Larry Sheets 88 112 50 A E 0 0 0
## -Lou Whitaker 724 522 576 A E 276 421 11
## -Mike Aldrete 27 25 33 N W 317 36 1
## -Marty Barrett 216 163 166 A E 303 450 14
## -Mike Davis 300 263 153 A W 310 9 9
## -Mike Diaz 24 37 19 N E 201 6 3
## -Mariano Duncan 121 69 68 N W 172 317 25
## -Mike Easler 445 491 301 A E 0 0 0
## -Mel Hall 210 203 136 A E 233 7 7
## -Mike Heath 315 325 189 N E 259 30 10
## -Mike Kingery 25 14 12 A W 102 6 3
## -Mike LaValliere 20 36 45 N E 468 47 6
## -Mike Marshall 247 288 161 N W 149 8 6
## -Mike Pagliarulo 150 167 114 A E 103 283 19
## -Mark Salas 80 75 36 A W 358 32 8
## -Mike Schmidt 6 7 4 N E 78 220 6
## -Mike Scioscia 181 199 288 N W 756 64 15
## -Mickey Tettleton 59 55 78 A W 463 32 8
## -Milt Thompson 71 33 44 N E 212 1 2
## -Mitch Webster 132 83 79 N E 325 12 8
## -Mookie Wilson 451 249 168 N E 228 7 5
## -Marvell Wynne 198 120 113 N W 203 3 3
## -Mike Young 181 177 157 A E 149 1 6
## -Ozzie Guillen 129 80 24 A W 261 459 22
## -Oddibe McDowell 168 91 101 A W 325 13 3
## -Ozzie Smith 583 374 528 N E 229 453 15
## -Ozzie Virgil 176 202 175 N W 682 93 13
## -Phil Bradley 245 167 174 A W 250 11 1
## -Phil Garner 751 714 535 N W 58 141 23
## -Pete Incaviglia 82 88 55 A W 157 6 14
## -Paul Molitor 676 390 364 A E 82 170 15
## -Pete Rose 2165 1314 1566 N W 523 43 6
## -Pat Sheridan 166 125 105 A E 172 1 4
## -Pat Tabler 250 252 178 A E 846 84 9
## -Rafael Belliard 41 32 26 N E 117 269 12
## -Rick Burleson 630 435 403 A W 62 90 3
## -Randy Bush 178 192 136 A W 167 2 4
## -Rick Cerone 266 304 198 A E 391 44 4
## -Ron Cey 965 1128 990 N E 41 118 8
## -Rob Deer 102 109 102 A E 286 8 8
## -Rick Dempsey 438 380 466 A E 659 53 7
## -Ron Hassey 249 322 274 A E 251 9 4
## -Rickey Henderson 862 417 708 A E 426 4 6
## Salary NewLeague
## -Alan Ashby 6.163315 N
## -Alvin Davis 6.173786 A
## -Andre Dawson 6.214608 N
## -Andres Galarraga 4.516339 N
## -Alfredo Griffin 6.620073 A
## -Al Newman 4.248495 A
## -Argenis Salazar 4.605170 A
## -Andres Thomas 4.317488 N
## -Andre Thornton 7.003065 A
## -Alan Trammell 6.248319 A
## -Alex Trevino 6.239301 N
## -Andy VanSlyke 6.309918 N
## -Alan Wiggins 6.551080 A
## -Bill Almon 5.480639 N
## -Buddy Bell 6.652863 N
## -Buddy Biancalana 5.164786 A
## -Bruce Bochy 4.905275 N
## -Barry Bonds 4.605170 N
## -Bobby Bonilla 4.744932 N
## -Bob Brenly 6.396930 N
## -Bill Buckner 6.655012 A
## -Brett Butler 6.639876 A
## -Bob Dernier 6.562914 N
## -Bo Diaz 6.620073 N
## -Bill Doran 6.437752 N
## -Brian Downing 6.802395 A
## -Billy Hatcher 4.700480 N
## -Brook Jacoby 6.417549 A
## -Bob Kearney 5.703782 A
## -Bill Madlock 6.745236 N
## -Bob Melvin 4.499810 N
## -BillyJo Robidoux 4.212128 A
## -Bill Schroeder 5.192957 A
## -Chris Bando 5.720312 A
## -Chris Brown 5.370638 N
## -Carmen Castillo 5.511411 A
## -Chili Davis 6.703188 N
## -Carlton Fisk 6.774224 A
## -Curt Ford 4.248495 N
## -Carney Lansford 7.090077 A
## -Chet Lemon 6.514713 A
## -Candy Maldonado 6.028279 N
## -Carmelo Martinez 5.828946 N
## -Craig Reynolds 6.032287 N
## -Cal Ripken 7.207860 A
## -Cory Snyder 4.499810 A
## -Chris Speier 5.616771 N
## -Curt Wilkerson 5.438079 A
## -Dave Anderson 5.416100 N
## -Don Baylor 6.856462 A
## -Daryl Boston 4.317488 A
## -Darnell Coles 4.653960 A
## -Dave Concepcion 5.768321 N
## -Doug DeCinces 6.745236 A
## -Darrell Evans 6.282267 A
## -Dwight Evans 6.838762 A
## -Damaso Garcia 6.745236 N
## -Dan Gladden 5.347108 A
## -Dave Henderson 5.783825 A
## -Donnie Hill 5.616771 A
## -Davey Lopes 6.109248 N
## -Don Mattingly 7.588324 A
## -Dale Murphy 7.549609 N
## -Dwayne Murphy 6.396930 A
## -Dave Parker 6.948578 N
## -Dan Pasqua 4.700480 A
## -Darrell Porter 5.560682 A
## -Dick Schofield 6.163315 A
## -Don Slaught 6.067268 A
## -Darryl Strawberry 7.106606 N
## -Dale Sveum 4.248495 A
## -Danny Tartabull 4.976734 A
## -Denny Walling 6.388561 N
## -Dave Winfield 7.529116 A
## -Eric Davis 5.703782 N
## -Eddie Milner 6.194405 N
## -Eddie Murray 7.807917 A
## -Ed Romero 5.926926 A
## -Frank White 6.620073 A
## -George Bell 7.069023 A
## -Glenn Braggs 4.248495 A
## -George Brett 7.313220 A
## -Greg Brock 5.953243 A
## -Gary Carter 7.562978 N
## -Glenn Davis 5.370638 N
## -Gary Gaetti 6.802395 A
## -Greg Gagne 5.043425 A
## -George Hendrick 6.551080 A
## -Glenn Hubbard 6.282267 N
## -Garth Iorg 5.893024 A
## -Gary Matthews 6.597600 N
## -Graig Nettles 5.298317 N
## -Gary Pettis 5.991465 A
## -Gary Redus 5.991465 A
## -Garry Templeton 6.603266 N
## -Greg Walker 6.214608 A
## -Gary Ward 6.396930 A
## -Glenn Wilson 6.496021 N
## -Harold Baines 6.856462 A
## -Hubie Brooks 6.620073 N
## -Howard Johnson 5.695414 N
## -Hal McRae 5.783825 A
## -Harold Reynolds 4.471639 A
## -Harry Spilman 5.164786 N
## -Herm Winningham 4.499810 N
## -Jesse Barfield 7.120848 A
## -Juan Beniquez 6.063785 A
## -John Cangelosi 4.605170 N
## -Jose Canseco 5.105945 A
## -Joe Carter 5.521461 A
## -Jack Clark 7.170120 N
## -Jose Cruz 6.650710 N
## -Jody Davis 6.916054 N
## -Jim Dwyer 5.616771 A
## -Julio Franco 6.652863 A
## -Jim Gantner 6.745236 A
## -Johnny Grubb 5.899897 A
## -Jack Howell 4.553877 A
## -John Kruk 4.700480 N
## -Jeffrey Leonard 4.605170 N
## -Jim Morrison 5.625821 N
## -John Moses 4.382027 A
## -Jerry Mumphrey 6.396930 N
## -Jim Presley 5.298317 A
## -Johnny Ray 6.487684 N
## -Jeff Reed 4.317488 N
## -Jim Rice 7.788419 A
## -Jerry Royster 5.521461 A
## -John Russell 5.043425 N
## -Juan Samuel 6.461468 N
## -John Shelby 5.703782 A
## -Joel Skinner 4.700480 A
## -Jim Sundberg 6.715383 N
## -Jose Uribe 5.273000 N
## -Joel Youngblood 6.109248 N
## -Kevin Bass 6.445720 N
## -Kal Daniels 4.460144 N
## -Kirk Gibson 7.170120 A
## -Ken Griffey 6.907755 N
## -Keith Hernandez 7.495542 N
## -Kent Hrbek 7.177782 A
## -Ken Landreaux 6.603266 N
## -Kevin McReynolds 6.437752 N
## -Kevin Mitchell 4.828314 N
## -Keith Moreland 6.950176 N
## -Ken Oberkfell 6.586172 N
## -Ken Phelps 5.703782 A
## -Kirby Puckett 5.899897 A
## -Kurt Stillwell 4.317488 N
## -Leon Durham 7.076090 N
## -Len Dykstra 5.310740 N
## -Larry Herndon 5.416100 A
## -Lee Lacy 6.263398 A
## -Len Matuszek 5.579730 N
## -Lloyd Moseby 6.668863 A
## -Lance Parrish 6.684612 N
## -Larry Parrish 6.375876 A
## -Larry Sheets 4.976734 A
## -Lou Whitaker 6.040255 A
## -Mike Aldrete 4.317488 N
## -Marty Barrett 6.354370 A
## -Mike Davis 6.659294 A
## -Mike Diaz 4.499810 N
## -Mariano Duncan 5.010635 N
## -Mike Easler 6.551080 N
## -Mel Hall 6.309918 A
## -Mike Heath 6.476972 A
## -Mike Kingery 4.219508 A
## -Mike LaValliere 4.605170 N
## -Mike Marshall 6.507278 N
## -Mike Pagliarulo 5.164786 A
## -Mark Salas 4.919981 A
## -Mike Schmidt 7.662624 N
## -Mike Scioscia 6.774224 N
## -Mickey Tettleton 4.787492 A
## -Milt Thompson 4.941642 N
## -Mitch Webster 5.347108 N
## -Mookie Wilson 6.684612 N
## -Marvell Wynne 5.480639 N
## -Mike Young 5.857933 A
## -Ozzie Guillen 5.164786 A
## -Oddibe McDowell 5.298317 A
## -Ozzie Smith 7.570443 N
## -Ozzie Virgil 6.551080 N
## -Phil Bradley 6.620073 A
## -Phil Garner 6.109248 N
## -Pete Incaviglia 5.147494 A
## -Paul Molitor 7.138867 A
## -Pete Rose 6.620073 N
## -Pat Sheridan 5.247024 A
## -Pat Tabler 6.363028 A
## -Rafael Belliard 4.867534 N
## -Rick Burleson 6.109248 A
## -Randy Bush 5.703782 A
## -Rick Cerone 5.521461 A
## -Ron Cey 6.956545 A
## -Rob Deer 5.370638 A
## -Rick Dempsey 5.991465 A
## -Ron Hassey 6.327937 A
## -Rickey Henderson 7.420579 A
summary(datosH_train)
## AtBat Hits HmRun Runs
## Min. : 20.0 Min. : 1.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:282.8 1st Qu.: 71.75 1st Qu.: 5.00 1st Qu.: 33.00
## Median :402.0 Median :101.50 Median :10.00 Median : 50.00
## Mean :396.6 Mean :106.07 Mean :12.47 Mean : 54.77
## 3rd Qu.:513.2 3rd Qu.:139.50 3rd Qu.:19.00 3rd Qu.: 73.25
## Max. :680.0 Max. :238.00 Max. :40.00 Max. :130.00
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.00 Min. : 41.0
## 1st Qu.: 31.00 1st Qu.:23.00 1st Qu.: 4.00 1st Qu.: 812.5
## Median : 47.50 Median :36.50 Median : 6.00 Median : 1967.0
## Mean : 52.95 Mean :41.05 Mean : 7.53 Mean : 2731.4
## 3rd Qu.: 74.00 3rd Qu.:58.00 3rd Qu.:11.00 3rd Qu.: 4088.0
## Max. :121.00 Max. :97.00 Max. :24.00 Max. :14053.0
## CHits CHmRun CRuns CRBI
## Min. : 9.0 Min. : 0.00 Min. : 6.0 Min. : 7.0
## 1st Qu.: 201.8 1st Qu.: 16.00 1st Qu.: 111.8 1st Qu.: 101.5
## Median : 542.0 Median : 44.00 Median : 263.5 Median : 250.0
## Mean : 742.3 Mean : 74.46 Mean : 373.4 Mean : 346.6
## 3rd Qu.:1130.8 3rd Qu.: 97.25 3rd Qu.: 561.5 3rd Qu.: 471.2
## Max. :4256.0 Max. :384.00 Max. :2165.0 Max. :1314.0
## CWalks League Division PutOuts Assists
## Min. : 4.00 A:109 E: 97 Min. : 0.0 Min. : 0.0
## 1st Qu.: 68.75 N: 91 W:103 1st Qu.: 114.5 1st Qu.: 7.0
## Median : 176.50 Median : 226.5 Median : 33.5
## Mean : 270.70 Mean : 281.6 Mean :100.0
## 3rd Qu.: 356.50 3rd Qu.: 317.5 3rd Qu.:153.0
## Max. :1566.00 Max. :1377.0 Max. :487.0
## Errors Salary NewLeague
## Min. : 0.000 Min. :4.212 A:108
## 1st Qu.: 3.000 1st Qu.:5.267 N: 92
## Median : 6.000 Median :6.088
## Mean : 8.125 Mean :5.940
## 3rd Qu.:11.250 3rd Qu.:6.620
## Max. :26.000 Max. :7.808
summary(datosH_test)
## AtBat Hits HmRun Runs
## Min. : 19.0 Min. : 4.0 Min. : 0.000 Min. : 2.00
## 1st Qu.:284.5 1st Qu.: 73.0 1st Qu.: 3.000 1st Qu.: 36.00
## Median :461.0 Median :117.0 Median : 8.000 Median : 54.00
## Mean :426.0 Mean :113.4 Mean : 8.905 Mean : 54.67
## 3rd Qu.:553.0 3rd Qu.:144.5 3rd Qu.:14.000 3rd Qu.: 71.00
## Max. :687.0 Max. :213.0 Max. :29.000 Max. :107.00
## RBI Walks Years CAtBat
## Min. : 3.00 Min. : 1.00 Min. : 1.000 Min. : 19
## 1st Qu.: 29.00 1st Qu.: 22.50 1st Qu.: 4.000 1st Qu.: 900
## Median : 45.00 Median : 37.00 Median : 6.000 Median :1770
## Mean : 46.86 Mean : 41.32 Mean : 6.619 Mean :2423
## 3rd Qu.: 59.50 3rd Qu.: 53.50 3rd Qu.: 8.000 3rd Qu.:3154
## Max. :100.00 Max. :105.00 Max. :20.000 Max. :9528
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 1.00 Min. : 2.0 Min. : 3.0
## 1st Qu.: 223.5 1st Qu.: 11.50 1st Qu.: 103.5 1st Qu.: 89.5
## Median : 448.0 Median : 27.00 Median : 226.0 Median : 163.0
## Mean : 658.2 Mean : 52.67 Mean : 322.6 Mean : 278.9
## 3rd Qu.: 873.0 3rd Qu.: 59.00 3rd Qu.: 415.0 3rd Qu.: 347.0
## Max. :2583.0 Max. :548.00 Max. :1509.0 Max. :1659.0
## CWalks League Division PutOuts Assists
## Min. : 1.0 A:30 E:32 Min. : 0.0 Min. : 0.0
## 1st Qu.: 83.0 N:33 W:31 1st Qu.: 116.0 1st Qu.: 12.5
## Median : 165.0 Median : 200.0 Median :144.0
## Mean : 227.1 Mean : 319.5 Mean :178.3
## 3rd Qu.: 279.0 3rd Qu.: 331.0 3rd Qu.:322.0
## Max. :1342.0 Max. :1320.0 Max. :492.0
## Errors Salary NewLeague
## Min. : 0.00 Min. :4.248 A:33
## 1st Qu.: 4.00 1st Qu.:5.234 N:30
## Median : 9.00 Median :5.953
## Mean :10.08 Mean :5.886
## 3rd Qu.:16.00 3rd Qu.:6.607
## Max. :32.00 Max. :7.378
Modelo
library(tree)
## Warning: package 'tree' was built under R version 4.2.3
set.seed(1234)

# Growth controls for the tree (see the tree.control documentation):
#   mincut  - minimum number of observations in either child node
#   minsize - smallest node size that may still be split
#   mindev  - a node is split only if its deviance is at least this fraction
#             of the root-node deviance
setup <- tree.control(
  nobs    = nrow(datosH_train),
  mindev  = 0.01,
  minsize = 10,
  mincut  = 5
)

# Fit the regression tree of (log) Salary on all remaining columns
modelo_arbolR <- tree(
  formula = Salary ~ .,
  data    = datosH_train,
  control = setup,
  split   = "deviance"
)

# Print the node-by-node description of the fitted tree
modelo_arbolR
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 166.4000 5.940
## 2) CAtBat < 1322 70 23.3000 4.971
## 4) CRuns < 92.5 43 12.6000 4.696
## 8) AtBat < 173 5 7.2760 5.321 *
## 9) AtBat > 173 38 3.1220 4.614 *
## 5) CRuns > 92.5 27 2.2610 5.409 *
## 3) CAtBat > 1322 130 42.0400 6.462
## 6) Walks < 48.5 80 20.2600 6.232
## 12) Hits < 98.5 41 9.8330 6.019
## 24) AtBat < 335 36 6.8960 6.097 *
## 25) AtBat > 335 5 1.1580 5.461 *
## 13) Hits > 98.5 39 6.6220 6.456
## 26) CHits < 590.5 12 1.8470 6.118 *
## 27) CHits > 590.5 27 2.7950 6.606 *
## 7) Walks > 48.5 50 10.8100 6.829
## 14) CRBI < 369.5 16 0.9873 6.498 *
## 15) CRBI > 369.5 34 7.2280 6.985
## 30) PutOuts < 286 20 2.6210 6.786 *
## 31) PutOuts > 286 14 2.6770 7.270 *
summary(modelo_arbolR)
##
## Regression tree:
## tree(formula = Salary ~ ., data = datosH_train, control = setup,
## split = "deviance")
## Variables actually used in tree construction:
## [1] "CAtBat" "CRuns" "AtBat" "Walks" "Hits" "CHits" "CRBI"
## [8] "PutOuts"
## Number of terminal nodes: 10
## Residual mean deviance: 0.1665 = 31.64 / 190
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.003000 -0.221300 0.009429 0.000000 0.246800 2.342000
# Draw the fitted tree, then overlay the split rules and node values
plot(x = modelo_arbolR, type = "proportional")
text(
  modelo_arbolR,
  col    = "blue",
  cex    = 0.8,
  pretty = 0,
  splits = TRUE
)
Evaluación del modelo
# Predict (log) Salary for the held-out test rows with the unpruned tree
predicciones <- predict(object = modelo_arbolR, newdata = datosH_test)

# Predicted vs observed values; points on the red identity line are
# perfect predictions
plot(
  x    = predicciones,
  y    = datosH_test[["Salary"]],
  pch  = 19,
  col  = "lightblue",
  main = "Predicciones modelo vs valor real",
  xlab = "Predicción",
  ylab = "Salary real"
)
abline(a = 0, b = 1, col = "red")

# Test mean squared error (on the log-Salary scale)
testMSE <- mean((predicciones - datosH_test[["Salary"]])^2)
testMSE
## [1] 0.3116231
Poda
# 10-fold cross-validation of the pruning sequence of the full tree.
# The seed is fixed so the run can be repeated.
set.seed(1234)
cv_arboles <- cv.tree(
  object = modelo_arbolR,
  FUN    = prune.tree,
  K      = 10
)

# Show the sizes, CV deviances and cost-complexity values that were tried.
cv_arboles
## $size
## [1] 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 71.51617 71.08538 66.26328 66.42213 59.19829 54.92047 56.86814
## [8] 64.77678 70.52970 167.59386
##
## $k
## [1] -Inf 1.779047 1.930350 1.980293 2.206202 2.590201
## [7] 3.801387 8.432809 10.977019 101.065841
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Two panels side by side for the cross-validation curves.
par(mfrow = c(1, 2))

# Left: CV deviance against the number of terminal nodes.
plot(
  x    = cv_arboles$size,
  y    = cv_arboles$dev,
  type = "b",
  pch  = 19,
  xlab = "nodos terminales",
  ylab = "RSS"
)

# Right: CV deviance against the cost-complexity parameter k.
plot(
  x    = cv_arboles$k,
  y    = cv_arboles$dev,
  type = "b",
  xlab = "alpha",
  ylab = "RSS"
)

# Tree size with the smallest cross-validated deviance.
n_mejores_nodos <- cv_arboles$size[which.min(cv_arboles$dev)]
n_mejores_nodos
## [1] 5
# Prune the full tree down to the size selected by cross-validation.
modelo_arbolRpodado <- prune.tree(
  tree = modelo_arbolR,
  best = n_mejores_nodos
)

# Back to a single plotting panel, then draw the pruned tree with labels.
par(mfrow = c(1, 1))
plot(x = modelo_arbolRpodado, type = "proportional")
text(
  x      = modelo_arbolRpodado,
  splits = TRUE,
  pretty = 0,
  cex    = 0.8,
  col    = "blue"
)

# Predict Salary for the test set with the pruned tree.
predicciones <- predict(object = modelo_arbolRpodado, newdata = datosH_test)

# Predicted vs. observed Salary; the red line is the identity y = x.
plot(
  x    = predicciones,
  y    = datosH_test$Salary,
  main = "Predicciones modelo pruned vs valor real",
  xlab = "Predicción",
  ylab = "Salary real",
  pch  = 19,
  col  = "lightblue"
)
abline(a = 0, b = 1, col = "red")

# Mean squared error of the pruned tree on the test set.
testMSE <- mean((datosH_test$Salary - predicciones)^2)
testMSE
## [1] 0.3983201
BAGGING
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Bagging: a random forest in which every split may choose among all
# predictors, hence mtry = 19 (the training data has 19 predictors).
set.seed(12345)
modelo_bagging <- randomForest(
  formula    = Salary ~ .,
  data       = datosH_train,
  mtry       = 19,
  importance = TRUE, # also compute variable-importance measures
  ntree      = 500
)

# Print the fit summary (residual MSE and % variance explained).
modelo_bagging
##
## Call:
## randomForest(formula = Salary ~ ., data = datosH_train, mtry = 19, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 19
##
## Mean of squared residuals: 0.2178778
## % Var explained: 73.81
# Plot the bagging model's error curve (see ?plot.randomForest).
plot(x = modelo_bagging, col = "RED")

# Importance of each predictor, reported as %IncMSE and IncNodePurity.
importance(x = modelo_bagging)
## %IncMSE IncNodePurity
## AtBat 10.8396694 8.25852525
## Hits 6.9948540 4.12895351
## HmRun 2.4976025 1.42026709
## Runs 4.7207726 2.93971768
## RBI 2.5828719 3.12078618
## Walks 9.0427367 7.28427261
## Years 10.1727265 1.77400834
## CAtBat 38.3915676 87.47465801
## CHits 7.7941256 10.53059969
## CHmRun 10.4361761 4.83378298
## CRuns 12.8295919 10.23868390
## CRBI 10.8096437 11.11199819
## CWalks 6.1292262 5.31549502
## League -0.4724837 0.08828289
## Division -1.1006285 0.17066253
## PutOuts -0.2362155 2.78826103
## Assists -1.7612397 1.39242192
## Errors 1.7352449 1.26953850
## NewLeague 1.6809973 0.22398061
# Graphical view of the variable-importance measures.
varImpPlot(x = modelo_bagging)

# Predict Salary for the test set with the bagging model.
predicciones <- predict(object = modelo_bagging, newdata = datosH_test)

# Predicted vs. observed Salary; the red line is the identity y = x.
plot(
  x    = predicciones,
  y    = datosH_test$Salary,
  main = "Predicciones modelo vs valor real",
  xlab = "Predicción",
  ylab = "Salary real",
  pch  = 19,
  col  = "lightblue"
)
abline(a = 0, b = 1, col = "red")

# Mean squared error of the bagging model on the test set.
testMSE <- mean((datosH_test$Salary - predicciones)^2)
testMSE
## [1] 0.2313487
Random Forest
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
library(parallel)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.2.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.2.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.2.3
# Start a local worker cluster for caret's parallel resampling, leaving one
# core free for the main session. detectCores() returns NA when the core
# count cannot be determined and 1 on single-core machines; in both cases
# `detectCores() - 1` is not a valid worker count, so clamp to at least 1.
n_nucleos <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(n_nucleos)
registerDoParallel(cluster) # register the cluster as the foreach backend
# NOTE(review): no stopCluster(cluster) call is visible in this part of the
# script -- confirm the cluster is released once all parallel fits are done.
# Resampling scheme: 10-fold cross-validation over an explicit tuning grid,
# allowing caret to use the registered parallel backend.
fitControl <- trainControl(
  method        = "cv",
  number        = 10,
  search        = "grid",
  allowParallel = TRUE
)

# Hyperparameter to tune: number of predictors sampled at each split.
grid_mtry <- expand.grid(mtry = 2:18)

# Fit the random forest, picking the mtry with the lowest CV RMSE.
set.seed(356)
modelo_rf <- train(
  form      = Salary ~ .,
  data      = datosH_train,
  method    = "rf",
  metric    = "RMSE",
  ntree     = 2500,
  tuneGrid  = grid_mtry,
  trControl = fitControl
)

# Print the resampling results for every mtry and the selected value.
modelo_rf
## Random Forest
##
## 200 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 179, 180, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.4331271 0.7732640 0.3114014
## 3 0.4324308 0.7720967 0.3086762
## 4 0.4338872 0.7707804 0.3094973
## 5 0.4340999 0.7704887 0.3088605
## 6 0.4362455 0.7684412 0.3098983
## 7 0.4371085 0.7679279 0.3111786
## 8 0.4380378 0.7669149 0.3114735
## 9 0.4394862 0.7655573 0.3126704
## 10 0.4392466 0.7656575 0.3113406
## 11 0.4412955 0.7637977 0.3134650
## 12 0.4422153 0.7625834 0.3145751
## 13 0.4432065 0.7619489 0.3144955
## 14 0.4416540 0.7633560 0.3147625
## 15 0.4430513 0.7619516 0.3143935
## 16 0.4440508 0.7613117 0.3165408
## 17 0.4437872 0.7614875 0.3158825
## 18 0.4443119 0.7610086 0.3168118
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Cross-validated performance across the mtry values that were tried.
plot(modelo_rf)

# Top 15 most important variables in the model (unscaled importance).
plot(
  varImp(modelo_rf, scale = FALSE),
  top = 15
)

# Predict Salary for the test set with the tuned random forest.
predicciones <- predict(object = modelo_rf, newdata = datosH_test)

# Predicted vs. observed Salary; the red line is the identity y = x.
plot(
  x    = predicciones,
  y    = datosH_test$Salary,
  main = "Predicciones modelo_rf vs valor real",
  xlab = "Predicción",
  ylab = "Salary real",
  pch  = 19,
  col  = "lightblue"
)
abline(a = 0, b = 1, col = "red")

# Mean squared error of the random forest on the test set.
testMSE <- mean((datosH_test$Salary - predicciones)^2)
testMSE
## [1] 0.2120682
Stochastic Gradient Boosting
# Resampling scheme: 10-fold cross-validation over an explicit tuning grid,
# allowing caret to use the registered parallel backend.
fitControl <- trainControl(
  method        = "cv",
  number        = 10,
  search        = "grid",
  allowParallel = TRUE
)

# Hyperparameter combinations to evaluate: 5 * 3 * 4 * 3 = 180.
grid_hiperparametros <- expand.grid(
  shrinkage         = c(0.002, 0.005, 0.01, 0.012, 0.02),
  n.trees           = c(200, 300, 400),
  n.minobsinnode    = c(5, 7, 10, 20),
  interaction.depth = c(1, 2, 3)
)

# Fit the boosted model, keeping the combination with the lowest CV RMSE.
# verbose = FALSE is forwarded to the underlying fit to silence its log.
set.seed(356)
modelo_gboost <- train(
  form      = Salary ~ .,
  data      = datosH_train,
  method    = "gbm",
  metric    = "RMSE",
  tuneGrid  = grid_hiperparametros,
  trControl = fitControl,
  verbose   = FALSE
)
# Cross-validated RMSE as a function of the tuned hyperparameters.
plot(modelo_gboost)

# Predict Salary for the test set with the tuned boosting model.
predicciones <- predict(object = modelo_gboost, newdata = datosH_test)

# Predicted vs. observed Salary; the red line is the identity y = x.
plot(
  x    = predicciones,
  y    = datosH_test$Salary,
  main = "Predicciones modelo boosting vs valor real",
  xlab = "Predicción",
  ylab = "Salary real",
  pch  = 19,
  col  = "lightblue"
)
abline(a = 0, b = 1, col = "red")

# Mean squared error of the boosting model on the test set.
testMSE <- mean((datosH_test$Salary - predicciones)^2)
testMSE
## [1] 0.2853416