setwd(“C:/OSCAR/RESPALDO CESAR/OSCAR/APRENDIZAJE/R/R DGTIC/MODELOS PREDICTIVOS R MAYO 2026/3”) getwd() install.packages(“kableExtra”)
library(janitor) library(lubridate) library(dplyr) library(tidyverse) # Manipulación y visualización de datos library(janitor) # Limpieza de nombres library(caret) # Entrenamiento, partición y evaluación de modelos library(rsample) # Partición de datos library(recipes) # Preprocesamiento para modelado library(yardstick) # Métricas de evaluación library(broom) # Organización de resultados de modelos library(rpart) # Árboles de decisión library(rpart.plot) # Visualización de árboles library(randomForest) # Bosques aleatorios library(class) # KNN library(cluster) # Agrupamiento library(factoextra) # Visualización de clustering library(knitr) # Tablas en reportes library(kableExtra) # Formato de tablas
# El archivo basetot contiene la base de datos con oder, item y products integrados
base_tot <- read.csv("C:/OSCAR/RESPALDO CESAR/OSCAR/APRENDIZAJE/R/R DGTIC/MODELOS PREDICTIVOS R MAYO 2026/1/BASE_TOTAL.csv", header=TRUE, sep=",")
##### Se hará una regresion lineal con la variable freight_value como dependiente
##### y price, product_weight_g y volumen del producto
### Calcula el volumen del producto para una mejor interpretacion
base_tot$volumen <- base_tot$product_length_cm *
base_tot$product_height_cm *
base_tot$product_width_cm
head(base_tot$volumen)
## [1] 1976 4693 9576 6000 11475 42250
# Seleccionar variables de interés
# y = freight_value = variable dependiente
# x1 = price, x2 = product_weight_g, x3 = volumen como variables independientes
##### Separacion de datos de entrenamiento y test
set.seed(123)
# Separar datos
idx <- sample(seq_len(nrow(base_tot)),
size = 0.7*nrow(base_tot))
train <- base_tot[idx, ]
test <- base_tot[-idx, ]
# Ajustar modelo
modelo_1 <- lm(freight_value ~ price +
product_weight_g +
volumen,
data = base_tot
)
# Resumen del modelo
summary(modelo_1)
##
## Call:
## lm(formula = freight_value ~ price + product_weight_g + volumen,
## data = base_tot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.42 -5.52 -0.83 2.79 326.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.210e+01 4.522e-02 267.70 <2e-16 ***
## price 1.951e-02 2.035e-04 95.91 <2e-16 ***
## product_weight_g 1.381e-03 1.592e-05 86.77 <2e-16 ***
## volumen 1.732e-04 2.514e-06 68.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.78 on 112628 degrees of freedom
## (793 observations deleted due to missingness)
## Multiple R-squared: 0.4447, Adjusted R-squared: 0.4447
## F-statistic: 3.006e+04 on 3 and 112628 DF, p-value: < 2.2e-16
# Predicciones
pred_1 <- predict(modelo_1, newdata = test)
summary(pred_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.31 14.25 16.49 19.96 20.77 151.04 230
##### pred_1 no está integrado a base_tot, se puede verificar con esta función que da FALSE
"pred_1" %in% names(base_tot)
## [1] FALSE
# Comparar valores reales vs predichos, se agregan las predicciones a test
test$pred_1 <- pred_1
head(test)
## order_id customer_id
## 2 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef
## 3 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089
## 4 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82
## 6 a4591c265e18cb1dcee52889e2d8acc3 503740e9ca751ccdda7ba28e9ab8f608
## 9 76c6e866289321a7c93b82b54852dc33 f54a9f0e6b351c431402b8461ea51999
## 11 e6ce16cb79ec1d90b1da9085a6118aeb 494dded5b201313c64ed7f100595b95c
## order_status order_purchase_timestamp order_approved_at
## 2 delivered 2018-07-24 20:41:37 2018-07-26 03:24:27
## 3 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23
## 4 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59
## 6 delivered 2017-07-09 21:57:05 2017-07-09 22:10:13
## 9 delivered 2017-01-23 18:29:09 2017-01-25 02:50:47
## 11 delivered 2017-05-16 19:41:10 2017-05-16 19:50:18
## order_delivered_carrier_date order_delivered_customer_date
## 2 2018-07-26 14:31:00 2018-08-07 15:27:45
## 3 2018-08-08 13:50:00 2018-08-17 18:06:29
## 4 2017-11-22 13:39:59 2017-12-02 00:28:42
## 6 2017-07-11 14:58:04 2017-07-26 10:57:55
## 9 2017-01-26 14:16:31 2017-02-02 14:08:10
## 11 2017-05-18 11:40:40 2017-05-29 11:18:31
## order_estimated_delivery_date purchase approved delivered_carr
## 2 2018-08-13 00:00:00 2018-07-24 2018-07-26 2018-07-26
## 3 2018-09-04 00:00:00 2018-08-08 2018-08-08 2018-08-08
## 4 2017-12-15 00:00:00 2017-11-18 2017-11-18 2017-11-22
## 6 2017-08-01 00:00:00 2017-07-09 2017-07-09 2017-07-11
## 9 2017-03-06 00:00:00 2017-01-23 2017-01-25 2017-01-26
## 11 2017-06-07 00:00:00 2017-05-16 2017-05-16 2017-05-18
## delivered_cust est_delivery order_item_id product_id
## 2 2018-08-07 2018-08-13 1 595fac2a385ac33a80bd5114aec74eb8
## 3 2018-08-17 2018-09-04 1 aa4383b373c6aca5d8797843e5594415
## 4 2017-12-02 2017-12-15 1 d0b61bfb1de832b15ba9d266ca96e5b0
## 6 2017-07-26 2017-08-01 1 060cb19345d90064d1015407193c233d
## 9 2017-02-02 2017-03-06 1 ac1789e492dcd698c5c10b97a671243a
## 11 2017-05-29 2017-06-07 1 08574b074924071f4e201e151b152b4e
## seller_id shipping_limit_date price freight_value
## 2 289cdb325fb7e7f891c38608bf9e0962 2018-07-30 03:24:27 118.7 22.76
## 3 4869f7a5dfa277a7dca6462dcf3b52b2 2018-08-13 08:55:23 159.9 19.22
## 4 66922902710d126a0e7d26b0e3805106 2017-11-23 19:45:59 45.0 27.20
## 6 8581055ce74af1daba164fdbd55a40de 2017-07-13 22:10:13 147.9 27.36
## 9 63b9ae557efed31d1f7687917d248a8d 2017-01-27 18:29:09 19.9 16.05
## 11 001cca7ae9ae17fb1caed9dfb1094831 2017-05-22 19:50:18 99.0 30.53
## shipping_limit product_category_name product_name_lenght
## 2 2018-07-30 perfumaria 29
## 3 2018-08-13 automotivo 46
## 4 2017-11-23 pet_shop 59
## 6 2017-07-13 automotivo 49
## 9 2017-01-27 moveis_decoracao 41
## 11 2017-05-22 ferramentas_jardim 36
## product_description_lenght product_photos_qty product_weight_g
## 2 178 1 400
## 3 232 1 420
## 4 468 3 450
## 6 608 1 7150
## 9 432 2 300
## 11 450 1 9000
## product_length_cm product_height_cm product_width_cm volumen pred_1
## 2 19 13 19 4693 15.78643
## 3 24 19 21 9576 17.46384
## 4 30 10 20 6000 14.64364
## 6 65 10 65 42250 32.18373
## 9 35 35 15 18375 16.09015
## 11 42 12 39 19656 29.87093
#test |>
# select(freight_value, pred_1) |>
# head()
library(ggplot2)
ggplot(test, aes(x = freight_value, y = pred_1)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
x = "Valor real",
y = "Valor predicho",
title = "Valores reales vs predichos"
) +
theme_minimal()
## Warning: Removed 230 rows containing missing values or values outside the scale range
## (`geom_point()`).
##### Se buscan los valores NA
sum(is.na(test$freight_value))
## [1] 227
sum(is.na(pred_1))
## [1] 230
rmse <- sqrt(
mean(
(test$freight_value - pred_1)^2,
na.rm = TRUE
)
)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.