# Full (unpruned) regression tree for Salary
#
# `X` is left out of the frame handed to tree(). In the recorded run tree()
# warned "NAs introduced by coercion", and the recorded random forest
# importance table lists `X` among the predictors; together these point to a
# non-numeric identifier column that could only be coerced to NA.
# NOTE(review): confirm that `X` is a row label and not a real feature. A
# column that coerces entirely to NA offers no usable split, so the fitted
# tree is expected to match the recorded output below -- re-run to confirm.
tree_train_data <- train_data[, setdiff(names(train_data), "X"), drop = FALSE]
tree_model <- tree(Salary ~ ., data = tree_train_data)
summary(tree_model)
## Output recorded when the tree was fitted on `train_data` directly (the
## printed call will now show `data = tree_train_data`):
##
## Regression tree:
## tree(formula = Salary ~ ., data = train_data)
## Variables actually used in tree construction:
## [1] "CHits" "Walks" "AtBat" "CRBI" "CHmRun" "Errors"
## Number of terminal nodes: 10
## Residual mean deviance: 45660 = 7991000 / 175
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -590.2 -101.7 -23.9 0.0 86.1 915.0

# Draw the tree skeleton, then add the split and leaf labels on top of it.
plot(tree_model)
text(tree_model, pretty = 0)

# Cross-validate the full tree, then prune it to 7 terminal nodes
set.seed(123) # fixed seed ahead of the cross-validation call
cv_model <- cv.tree(object = tree_model)
## In the recorded run this call emitted the following pair of warnings ten
## times in a row:
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion

# Cross-validated deviance against tree size, points joined by lines.
plot(
  x = cv_model$size,
  y = cv_model$dev,
  type = "b",
  xlab = "Tamaño del árbol",
  ylab = "Error"
)

# Keep the 7-leaf subtree and draw it with its labels.
pruned_tree <- prune.tree(tree = tree_model, best = 7)
plot(x = pruned_tree)
text(x = pruned_tree, pretty = 0)

# Test-set predictions from the pruned tree, scored with RMSE
pred_tree <- predict(object = pruned_tree, newdata = test_data)
## Recorded in the original run:
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion

# Compare the predictions with the observed test salaries.
rmse_tree <- RMSE(pred_tree, test_data[["Salary"]])
print(rmse_tree)
## [1] 342.5687
# Random forest on the same training data, scored on the same test data
#
# `X` is dropped before fitting. The importance table recorded below lists `X`
# as a predictor with a negative %IncMSE, and tree() reported coercion
# warnings on the same training data, which suggests `X` is a non-numeric
# identifier column rather than a real feature.
# NOTE(review): confirm that `X` is a row label. Excluding it changes the
# fitted forest, so the recorded numbers below are stale until re-run.
rf_train_data <- train_data[, setdiff(names(train_data), "X"), drop = FALSE]

set.seed(123)
rf_model <- randomForest(Salary ~ ., data = rf_train_data, importance = TRUE)
pred_rf <- predict(rf_model, newdata = test_data)
rmse_rf <- RMSE(pred_rf, test_data$Salary)
rmse_rf
## Recorded before `X` was excluded (re-run to refresh):
## [1] 331.21
importance(rf_model)
## Recorded before `X` was excluded (re-run to refresh):
## %IncMSE IncNodePurity
## X -2.9440890 636671.8
## AtBat 6.4333306 1877818.6
## HmRun 4.1665978 1159215.4
## Runs 5.4403551 1751332.8
## RBI 6.0989471 2387936.7
## Walks 5.3529754 2389409.9
## CAtBat 11.7223127 4227951.0
## CHits 12.1830085 4230802.8
## CHmRun 8.8117812 3612452.4
## CRuns 13.2284083 5634241.3
## CRBI 8.7171044 3631415.4
## CWalks 5.9968772 2595483.2
## League 1.3053146 113019.2
## Division 1.9411279 127438.8
## PutOuts 2.1718956 1544105.5
## Assists -2.5665002 580220.6
## Errors 2.1149669 553463.8
## NewLeague -0.3091606 106589.5
varImpPlot(rf_model)

# Two-predictor tree (CHits and CRuns), pruned to 7 terminal nodes
hitters_model_data <- select(hitters, Salary, CHits, CRuns)

# Rows indexed by `train_index` form the training set; the rest are held out.
train_simple <- hitters_model_data[train_index, ]
test_simple <- hitters_model_data[-train_index, ]

# Fit on the two predictors, prune, and draw the labelled result.
simple_tree <- tree(formula = Salary ~ ., data = train_simple)
pruned_simple_tree <- prune.tree(tree = simple_tree, best = 7)
plot(x = pruned_simple_tree)
text(x = pruned_simple_tree, pretty = 0)

# Scatterplot of CHits vs CRuns with the partition of `pruned_simple_tree`
# drawn on top using annotate().
#
# The dashed segments are derived from the fitted tree instead of being typed
# in by hand, so they follow the model whenever the data split or the pruning
# changes. Each segment is clipped to the rectangle of the node it splits.
partition_frame <- pruned_simple_tree$frame
partition_node <- as.numeric(rownames(partition_frame)) # node numbers
partition_var <- as.character(partition_frame$var)

# Rectangle covered by each node, keyed by node number; the root (node 1)
# covers the full range of the plotted data.
region_limits <- list(
  "1" = c(
    xmin = min(hitters_model_data$CHits, na.rm = TRUE),
    xmax = max(hitters_model_data$CHits, na.rm = TRUE),
    ymin = min(hitters_model_data$CRuns, na.rm = TRUE),
    ymax = max(hitters_model_data$CRuns, na.rm = TRUE)
  )
)
split_segments <- list()

# Visit nodes in increasing node number so a parent (k) is always handled
# before its children (2k and 2k + 1).
for (i in order(partition_node)) {
  if (partition_var[i] == "<leaf>") {
    next
  }
  node_key <- as.character(partition_node[i])
  limits <- region_limits[[node_key]]
  # Numeric split labels have the form "<cut"; drop the leading "<".
  cut_value <- as.numeric(
    substring(partition_frame$splits[i, "cutleft"], 2L)
  )
  left_limits <- limits
  right_limits <- limits
  if (partition_var[i] == "CHits") {
    # Split on the x variable: vertical segment across the node's y extent.
    split_segments[[node_key]] <- data.frame(
      x = cut_value, xend = cut_value,
      y = limits[["ymin"]], yend = limits[["ymax"]]
    )
    left_limits[["xmax"]] <- cut_value
    right_limits[["xmin"]] <- cut_value
  } else {
    # Split on the y variable (CRuns): horizontal segment across the x extent.
    split_segments[[node_key]] <- data.frame(
      x = limits[["xmin"]], xend = limits[["xmax"]],
      y = cut_value, yend = cut_value
    )
    left_limits[["ymax"]] <- cut_value
    right_limits[["ymin"]] <- cut_value
  }
  region_limits[[as.character(2 * partition_node[i])]] <- left_limits
  region_limits[[as.character(2 * partition_node[i] + 1)]] <- right_limits
}

partition_plot <- ggplot(hitters_model_data, aes(x = CHits, y = CRuns)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "División en regiones con CHits y CRuns",
    x = "CHits",
    y = "CRuns"
  )

# A tree pruned down to a single node has no splits; skip the layer then.
if (length(split_segments) > 0) {
  segment_data <- do.call(rbind, split_segments)
  partition_plot <- partition_plot +
    annotate(
      "segment",
      x = segment_data$x,
      xend = segment_data$xend,
      y = segment_data$y,
      yend = segment_data$yend,
      linetype = "dashed"
    )
}
partition_plot

# Conclusions
#
# The headline is chosen from the measured RMSEs so it cannot contradict them,
# and sep = "" stops cat() from inserting a space before each ")".
if (rmse_rf < rmse_tree) {
  cat(
    "• El modelo de Random Forest fue el más preciso (RMSE ≈ ",
    round(rmse_rf, 2),
    "), mejor que el árbol podado (RMSE ≈ ",
    round(rmse_tree, 2),
    ").\n",
    sep = ""
  )
} else {
  cat(
    "• El árbol podado fue el más preciso (RMSE ≈ ",
    round(rmse_tree, 2),
    "), mejor que el modelo de Random Forest (RMSE ≈ ",
    round(rmse_rf, 2),
    ").\n",
    sep = ""
  )
}
## With the RMSEs of the recorded run (331.21 and 342.57) this prints:
## • El modelo de Random Forest fue el más preciso (RMSE ≈ 331.21), mejor que el árbol podado (RMSE ≈ 342.57).

# NOTE(review): the variable names in the next message are hard-coded; check
# them against importance(rf_model) whenever the forest is refitted.
cat("• CHits y CRuns fueron las variables más importantes.\n")
## • CHits y CRuns fueron las variables más importantes.

# Trailing newline added so the console prompt starts on its own line.
cat("• El modelo simplificado con estas dos variables y 7 hojas permite interpretar visualmente cómo se segmenta el espacio.\n")
## • El modelo simplificado con estas dos variables y 7 hojas permite interpretar visualmente cómo se segmenta el espacio.