R Markdown

Se establece la ruta de trabajo

setwd(“C:/OSCAR/RESPALDO CESAR/OSCAR/APRENDIZAJE/R/R DGTIC/MODELOS PREDICTIVOS R MAYO 2026/3”) getwd() install.packages(“kableExtra”)

library(janitor) library(lubridate) library(dplyr) library(tidyverse) # Manipulación y visualización de datos library(janitor) # Limpieza de nombres library(caret) # Entrenamiento, partición y evaluación de modelos library(rsample) # Partición de datos library(recipes) # Preprocesamiento para modelado library(yardstick) # Métricas de evaluación library(broom) # Organización de resultados de modelos library(rpart) # Árboles de decisión library(rpart.plot) # Visualización de árboles library(randomForest) # Bosques aleatorios library(class) # KNN library(cluster) # Agrupamiento library(factoextra) # Visualización de clustering library(knitr) # Tablas en reportes library(kableExtra) # Formato de tablas

# El archivo basetot contiene la base de datos con oder, item y products integrados
base_tot <- read.csv("C:/OSCAR/RESPALDO CESAR/OSCAR/APRENDIZAJE/R/R DGTIC/MODELOS PREDICTIVOS R MAYO 2026/1/BASE_TOTAL.csv", header=TRUE, sep=",")

##### Se hará una regresion lineal con la variable freight_value como dependiente
##### y price, product_weight_g y volumen del producto

### Calcula el volumen del producto para una mejor interpretacion
base_tot$volumen <- base_tot$product_length_cm *
  base_tot$product_height_cm *
  base_tot$product_width_cm

head(base_tot$volumen)
## [1]  1976  4693  9576  6000 11475 42250
# Seleccionar variables de interés
# y = freight_value = variable dependiente
# x1 = price, x2 = product_weight_g, x3 = volumen como variables independientes



##### Separacion de datos de entrenamiento y test
set.seed(123)

# Separar datos
idx <- sample(seq_len(nrow(base_tot)),
              size = 0.7*nrow(base_tot))

train <- base_tot[idx, ]
test  <- base_tot[-idx, ]

# Ajustar modelo
modelo_1 <- lm(freight_value ~ price +
              product_weight_g +
              volumen,
              data = base_tot
)

# Resumen del modelo
summary(modelo_1)
## 
## Call:
## lm(formula = freight_value ~ price + product_weight_g + volumen, 
##     data = base_tot)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -99.42  -5.52  -0.83   2.79 326.50 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.210e+01  4.522e-02  267.70   <2e-16 ***
## price            1.951e-02  2.035e-04   95.91   <2e-16 ***
## product_weight_g 1.381e-03  1.592e-05   86.77   <2e-16 ***
## volumen          1.732e-04  2.514e-06   68.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.78 on 112628 degrees of freedom
##   (793 observations deleted due to missingness)
## Multiple R-squared:  0.4447, Adjusted R-squared:  0.4447 
## F-statistic: 3.006e+04 on 3 and 112628 DF,  p-value: < 2.2e-16
# Predicciones
pred_1 <- predict(modelo_1, newdata = test)
summary(pred_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.31   14.25   16.49   19.96   20.77  151.04     230
##### pred_1 no está integrado a base_tot, se puede verificar con esta función que da FALSE
"pred_1" %in% names(base_tot)
## [1] FALSE
# Comparar valores reales vs predichos, se agregan las predicciones a test
test$pred_1 <- pred_1
head(test)
##                            order_id                      customer_id
## 2  53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef
## 3  47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089
## 4  949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82
## 6  a4591c265e18cb1dcee52889e2d8acc3 503740e9ca751ccdda7ba28e9ab8f608
## 9  76c6e866289321a7c93b82b54852dc33 f54a9f0e6b351c431402b8461ea51999
## 11 e6ce16cb79ec1d90b1da9085a6118aeb 494dded5b201313c64ed7f100595b95c
##    order_status order_purchase_timestamp   order_approved_at
## 2     delivered      2018-07-24 20:41:37 2018-07-26 03:24:27
## 3     delivered      2018-08-08 08:38:49 2018-08-08 08:55:23
## 4     delivered      2017-11-18 19:28:06 2017-11-18 19:45:59
## 6     delivered      2017-07-09 21:57:05 2017-07-09 22:10:13
## 9     delivered      2017-01-23 18:29:09 2017-01-25 02:50:47
## 11    delivered      2017-05-16 19:41:10 2017-05-16 19:50:18
##    order_delivered_carrier_date order_delivered_customer_date
## 2           2018-07-26 14:31:00           2018-08-07 15:27:45
## 3           2018-08-08 13:50:00           2018-08-17 18:06:29
## 4           2017-11-22 13:39:59           2017-12-02 00:28:42
## 6           2017-07-11 14:58:04           2017-07-26 10:57:55
## 9           2017-01-26 14:16:31           2017-02-02 14:08:10
## 11          2017-05-18 11:40:40           2017-05-29 11:18:31
##    order_estimated_delivery_date   purchase   approved delivered_carr
## 2            2018-08-13 00:00:00 2018-07-24 2018-07-26     2018-07-26
## 3            2018-09-04 00:00:00 2018-08-08 2018-08-08     2018-08-08
## 4            2017-12-15 00:00:00 2017-11-18 2017-11-18     2017-11-22
## 6            2017-08-01 00:00:00 2017-07-09 2017-07-09     2017-07-11
## 9            2017-03-06 00:00:00 2017-01-23 2017-01-25     2017-01-26
## 11           2017-06-07 00:00:00 2017-05-16 2017-05-16     2017-05-18
##    delivered_cust est_delivery order_item_id                       product_id
## 2      2018-08-07   2018-08-13             1 595fac2a385ac33a80bd5114aec74eb8
## 3      2018-08-17   2018-09-04             1 aa4383b373c6aca5d8797843e5594415
## 4      2017-12-02   2017-12-15             1 d0b61bfb1de832b15ba9d266ca96e5b0
## 6      2017-07-26   2017-08-01             1 060cb19345d90064d1015407193c233d
## 9      2017-02-02   2017-03-06             1 ac1789e492dcd698c5c10b97a671243a
## 11     2017-05-29   2017-06-07             1 08574b074924071f4e201e151b152b4e
##                           seller_id shipping_limit_date price freight_value
## 2  289cdb325fb7e7f891c38608bf9e0962 2018-07-30 03:24:27 118.7         22.76
## 3  4869f7a5dfa277a7dca6462dcf3b52b2 2018-08-13 08:55:23 159.9         19.22
## 4  66922902710d126a0e7d26b0e3805106 2017-11-23 19:45:59  45.0         27.20
## 6  8581055ce74af1daba164fdbd55a40de 2017-07-13 22:10:13 147.9         27.36
## 9  63b9ae557efed31d1f7687917d248a8d 2017-01-27 18:29:09  19.9         16.05
## 11 001cca7ae9ae17fb1caed9dfb1094831 2017-05-22 19:50:18  99.0         30.53
##    shipping_limit product_category_name product_name_lenght
## 2      2018-07-30            perfumaria                  29
## 3      2018-08-13            automotivo                  46
## 4      2017-11-23              pet_shop                  59
## 6      2017-07-13            automotivo                  49
## 9      2017-01-27      moveis_decoracao                  41
## 11     2017-05-22    ferramentas_jardim                  36
##    product_description_lenght product_photos_qty product_weight_g
## 2                         178                  1              400
## 3                         232                  1              420
## 4                         468                  3              450
## 6                         608                  1             7150
## 9                         432                  2              300
## 11                        450                  1             9000
##    product_length_cm product_height_cm product_width_cm volumen   pred_1
## 2                 19                13               19    4693 15.78643
## 3                 24                19               21    9576 17.46384
## 4                 30                10               20    6000 14.64364
## 6                 65                10               65   42250 32.18373
## 9                 35                35               15   18375 16.09015
## 11                42                12               39   19656 29.87093
#test |>
#  select(freight_value, pred_1) |>
#  head()


library(ggplot2)
ggplot(test, aes(x = freight_value, y = pred_1)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    x = "Valor real",
    y = "Valor predicho",
    title = "Valores reales vs predichos"
  ) +
  theme_minimal()
## Warning: Removed 230 rows containing missing values or values outside the scale range
## (`geom_point()`).

##### Se buscan los valores NA 
sum(is.na(test$freight_value))
## [1] 227
sum(is.na(pred_1))
## [1] 230
rmse <- sqrt(
  mean(
    (test$freight_value - pred_1)^2,
    na.rm = TRUE
  )
)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.