## Rows: 400
## Columns: 11
## $ Sales <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, …
## $ CompPrice <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117…
## $ Income <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2…
## $ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, …
## $ Population <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,…
## $ Price <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136…
## $ ShelveLoc <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,…
## $ Age <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52…
## $ Education <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18…
## $ Urban <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye…
## $ US <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N…
## Número total de valores NA: 0
## Valores faltantes por variable:
## Sales CompPrice Income Advertising Population Price
## 0 0 0 0 0 0
## ShelveLoc Age Education Urban US
## 0 0 0 0 0
La base de datos Carseats no presenta valores faltantes, por lo que no fue necesario eliminar observaciones.
## Número de observaciones: 400
## Número de variables: 11
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
Las variables categóricas (ShelveLoc,
Urban, US) ya se encuentran codificadas como
factores, por lo que los modelos de árbol pueden utilizarlas
directamente.
numeric_data <- Carseats %>%
select(where(is.numeric))
hist_data <- numeric_data %>%
pivot_longer(
cols = everything(),
names_to = "Variable",
values_to = "Valor"
)
ggplot(hist_data, aes(x = Valor)) +
geom_histogram(
bins = 30,
fill = "#5c9e82",
color = "white"
) +
facet_wrap(
~ Variable,
scales = "free"
) +
labs(
title = "Histogramas de Variables Numéricas",
x = "Valor",
y = "Frecuencia"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold")
)Sales presenta una distribución aproximadamente
simétrica.Advertising presenta una concentración estructural en
cero, indicando múltiples tiendas sin gasto publicitario.Price y CompPrice muestran distribuciones
relativamente normales.Population presenta mayor dispersión.cor_matrix <- cor(numeric_data)
corrplot(
cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.6,
tl.col = "black",
tl.cex = 0.8,
col = colorRampPalette(
c("#B85042", "white", "#5c9e82")
)(200)
)Price presenta correlación negativa moderada con
Sales.Advertising presenta relación positiva con
Sales.CompPrice y
Price.ggplot(hist_data, aes(y = Valor)) +
geom_boxplot(
fill = "#5c9e82",
outlier.color = "tomato",
outlier.shape = 16
) +
facet_wrap(
~ Variable,
scales = "free"
) +
labs(
title = "Boxplots de Variables Numéricas",
y = "Valor",
x = ""
) +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.title = element_text(face = "bold")
)Price,
CompPrice y Sales.skew_table <- numeric_data %>%
summarise(
across(
everything(),
~ skewness(.x, na.rm = TRUE)
)
) %>%
pivot_longer(
everything(),
names_to = "Variable",
values_to = "Skewness"
) %>%
mutate(
Interpretacion = case_when(
Skewness > 0.5 ~ "Sesgo derecho",
Skewness < -0.5 ~ "Sesgo izquierdo",
TRUE ~ "Distribución aproximadamente simétrica"
)
)
skew_table## # A tibble: 8 × 3
## Variable Skewness Interpretacion
## <chr> <dbl> <chr>
## 1 Sales 0.184 Distribución aproximadamente simétrica
## 2 CompPrice -0.0424 Distribución aproximadamente simétrica
## 3 Income 0.0491 Distribución aproximadamente simétrica
## 4 Advertising 0.635 Sesgo derecho
## 5 Population -0.0508 Distribución aproximadamente simétrica
## 6 Price -0.124 Distribución aproximadamente simétrica
## 7 Age -0.0766 Distribución aproximadamente simétrica
## 8 Education 0.0437 Distribución aproximadamente simétrica
Ninguna variable presenta un sesgo absoluto superior a 1, por lo que no fue necesario aplicar transformaciones logarítmicas.
set.seed(20260417)
n <- nrow(Carseats)
train_index <- sample(
seq_len(n),
size = floor(0.70 * n)
)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
cat("Observaciones entrenamiento:", nrow(train), "\n")## Observaciones entrenamiento: 280
## Observaciones prueba: 120
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "Income"
## Number of terminal nodes: 16
## Residual mean deviance: 2.29 = 604.6 / 264
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.09100 -1.06100 0.07167 0.00000 1.05300 3.94900
plot(tree_full)
text(
tree_full,
pretty = 0,
cex = 0.7
)
title(
main = "Árbol de Decisión Completo"
)metricas <- function(real, pred){
mse <- mean((pred - real)^2)
rmse <- sqrt(mse)
r2 <- 1 -
sum((pred - real)^2) /
sum((real - mean(real))^2)
data.frame(
MSE = mse,
RMSE = rmse,
R2 = r2
)
}pred_full <- predict(
tree_full,
newdata = test
)
metrics_full <- metricas(
test$Sales,
pred_full
)
metrics_full## MSE RMSE R2
## 1 4.681081 2.163581 0.4506594
cv_results <- cv.tree(tree_full)
cv_df <- data.frame(
size = cv_results$size,
dev = cv_results$dev
)
best_size <- min(
cv_df$size[
cv_df$dev == min(cv_df$dev)
]
)
ggplot(cv_df, aes(x = size, y = dev)) +
geom_line(
linewidth = 1,
color = "#5c9e82"
) +
geom_point(
size = 3,
color = "#5c9e82"
) +
geom_vline(
xintercept = best_size,
linetype = "dashed",
linewidth = 1,
color = "tomato"
) +
scale_x_continuous(
breaks = cv_df$size
) +
labs(
title = "Validación Cruzada del Árbol",
subtitle = "Línea roja = tamaño óptimo",
x = "Número de hojas",
y = "Error de validación cruzada"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold")
)##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "Income"
## Number of terminal nodes: 16
## Residual mean deviance: 2.29 = 604.6 / 264
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.09100 -1.06100 0.07167 0.00000 1.05300 3.94900
## size dev
## 1 16 1157.278
## 2 15 1182.267
## 3 14 1187.045
## 4 13 1190.864
## 5 12 1183.192
## 6 11 1205.088
## 7 10 1197.174
## 8 9 1196.813
## 9 8 1198.434
## 10 7 1294.273
## 11 6 1281.444
## 12 5 1307.408
## 13 4 1374.326
## 14 3 1415.657
## 15 2 1587.279
## 16 1 2170.701
plot(tree_pruned)
text(
tree_pruned,
pretty = 0,
cex = 0.8
)
title(
main = paste(
"Árbol Podado —",
best_size,
"hojas"
)
)pred_pruned <- predict(
tree_pruned,
newdata = test
)
metrics_pruned <- metricas(
test$Sales,
pred_pruned
)
metrics_pruned## MSE RMSE R2
## 1 4.681081 2.163581 0.4506594
best_pred <- if(
metrics_pruned$MSE <= metrics_full$MSE
){
pred_pruned
} else {
pred_full
}
best_name <- if(
metrics_pruned$MSE <= metrics_full$MSE
){
"Árbol Podado"
} else {
"Árbol Completo"
}
pred_df <- data.frame(
Real = test$Sales,
Predicho = best_pred
)
ggplot(pred_df, aes(x = Real, y = Predicho)) +
geom_point(
color = "#5c9e82",
alpha = 0.75,
size = 3
) +
geom_abline(
slope = 1,
intercept = 0,
color = "tomato",
linewidth = 1,
linetype = "dashed"
) +
geom_smooth(
method = "lm",
se = FALSE,
color = "darkblue",
linewidth = 1
) +
labs(
title = paste(
"Predicciones vs Valores Reales —",
best_name
),
subtitle = "Línea roja = predicción perfecta",
x = "Sales Real",
y = "Sales Predicho"
) +
theme_minimal(base_size = 13)p <- ncol(train) - 1
set.seed(20260417)
bagging_model <- randomForest(
Sales ~ .,
data = train,
mtry = p,
ntree = 500,
importance = TRUE
)
bagging_model##
## Call:
## randomForest(formula = Sales ~ ., data = train, mtry = p, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.426186
## % Var explained: 68.53
set.seed(20260417)
rf_model <- randomForest(
Sales ~ .,
data = train,
mtry = floor(p / 3),
ntree = 500,
importance = TRUE
)
rf_model##
## Call:
## randomForest(formula = Sales ~ ., data = train, mtry = floor(p/3), ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 2.711411
## % Var explained: 64.83
pred_bagging <- predict(
bagging_model,
newdata = test
)
pred_rf <- predict(
rf_model,
newdata = test
)
metrics_bagging <- metricas(
test$Sales,
pred_bagging
)
metrics_rf <- metricas(
test$Sales,
pred_rf
)
metrics_bagging## MSE RMSE R2
## 1 2.969186 1.723133 0.6515561
## MSE RMSE R2
## 1 3.326952 1.823993 0.609571
comparison <- data.frame(
Modelo = c(
"Árbol Completo",
"Árbol Podado",
"Bagging",
"Random Forest"
),
MSE = c(
metrics_full$MSE,
metrics_pruned$MSE,
metrics_bagging$MSE,
metrics_rf$MSE
),
RMSE = c(
metrics_full$RMSE,
metrics_pruned$RMSE,
metrics_bagging$RMSE,
metrics_rf$RMSE
),
R2 = c(
metrics_full$R2,
metrics_pruned$R2,
metrics_bagging$R2,
metrics_rf$R2
)
)
comparison <- comparison %>%
mutate(
across(
c(MSE, RMSE, R2),
~ round(.x, 4)
)
)
comparison## Modelo MSE RMSE R2
## 1 Árbol Completo 4.6811 2.1636 0.4507
## 2 Árbol Podado 4.6811 2.1636 0.4507
## 3 Bagging 2.9692 1.7231 0.6516
## 4 Random Forest 3.3270 1.8240 0.6096
El modelo de Bagging obtuvo el menor MSE y el mayor valor de R², indicando el mejor desempeño predictivo sobre el conjunto de prueba. Random Forest también superó ampliamente a los árboles individuales, aunque con resultados ligeramente inferiores al Bagging.
importance_df <- data.frame(
Variable = rownames(
importance(rf_model)
),
IncMSE = importance(rf_model)[,"%IncMSE"]
)
importance_df <- importance_df %>%
arrange(desc(IncMSE))
ggplot(
importance_df,
aes(
x = reorder(Variable, IncMSE),
y = IncMSE
)
) +
geom_col(
fill = "#3b82f6"
) +
coord_flip() +
labs(
title = "Importancia de Variables — Random Forest",
x = "",
y = "% Incremento en MSE"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold")
)top2 <- importance_df$Variable[1:2]
cat(
"Las dos variables más importantes fueron:",
top2[1],
"y",
top2[2]
)## Las dos variables más importantes fueron: ShelveLoc y Price
Price mostró una fuerte relación negativa con
ventas:
ShelveLoc fue clave porque la ubicación del producto
influye directamente en visibilidad y demanda
Estas variables dominaron tanto:
árboles individuales
como modelos ensemble
formula_top2 <- as.formula(
paste(
"Sales ~",
paste(top2, collapse = " + ")
)
)
tree_7 <- tree(
formula_top2,
data = train
)
tree_7 <- prune.tree(
tree_7,
best = 7
)
summary(tree_7)##
## Regression tree:
## snip.tree(tree = tree_7, nodes = c(6L, 8L))
## Number of terminal nodes: 7
## Residual mean deviance: 3.717 = 1015 / 273
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.86200 -1.35200 0.06014 0.00000 1.34400 4.86700
pred_tree7 <- predict(
tree_7,
newdata = test
)
metrics_tree7 <- metricas(
test$Sales,
pred_tree7
)
metrics_tree7## MSE RMSE R2
## 1 5.20449 2.281335 0.3892356
x_var <- top2[1]
y_var <- top2[2]
plot_data <- train %>%
mutate(
region = factor(
round(
predict(tree_7, newdata = train),
2
)
)
)
nodos <- tree_7$frame[
tree_7$frame$var != "<leaf>",
]
p <- ggplot(
plot_data,
aes(
x = .data[[x_var]],
y = .data[[y_var]],
color = region
)
) +
geom_point(
size = 3,
alpha = 0.75
) +
scale_color_brewer(
palette = "Set1",
name = "Región"
) +
labs(
title = "Scatter Plot con 7 Regiones de Decisión",
subtitle = paste(
"Variables más importantes:",
x_var,
"y",
y_var
),
x = x_var,
y = y_var,
caption = "Cada color representa una región terminal del árbol"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "right"
)
for(i in 1:nrow(nodos)) {
split_var <- as.character(
nodos$var[i]
)
split_val <- as.numeric(gsub("[^0-9.]", "", tree_7$splits[i, "cutleft"]))
if(split_var == x_var){
p <- p +
geom_vline(
xintercept = split_val,
linetype = "dashed",
linewidth = 1,
color = "black"
)
}
if(split_var == y_var){
p <- p +
geom_hline(
yintercept = split_val,
linetype = "dashed",
linewidth = 1,
color = "black"
)
}
}
pregions_table <- tree_7$frame %>%
filter(var == "<leaf>") %>%
mutate(
Sales_Pred = round(yval, 2)
) %>%
select(
Sales_Pred,
n
) %>%
arrange(Sales_Pred)
regions_table$Region <- seq_len(
nrow(regions_table)
)
regions_table <- regions_table[
,
c("Region", "Sales_Pred", "n")
]
names(regions_table) <- c(
"Región",
"Sales Predicho",
"Número de Observaciones"
)
regions_table## Región Sales Predicho Número de Observaciones
## 1 1 4.10 23
## 2 2 5.86 41
## 3 3 6.31 47
## 4 4 7.46 86
## 5 5 7.67 15
## 6 6 9.25 20
## 7 7 11.15 48