library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.2
library(purrr)

Directorio de Trabajo

setwd("C:/Users/marco.albarran/Desktop/Modelos Predictivos con Lenguaje R")
getwd()
## [1] "C:/Users/marco.albarran/Desktop/Modelos Predictivos con Lenguaje R"

Importación de los archivos utilizando la función read_csv()

items <- read_csv("olist_order_items_dataset.csv")
## Rows: 112650 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): order_id, product_id, seller_id
## dbl  (3): order_item_id, price, freight_value
## dttm (1): shipping_limit_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
orders <- read_csv("olist_orders_dataset.csv")
## Rows: 99441 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): order_id, customer_id, order_status
## dttm (5): order_purchase_timestamp, order_approved_at, order_delivered_carri...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
products <- read_csv("olist_products_dataset.csv")
## Rows: 32951 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): product_id, product_category_name
## dbl (7): product_name_lenght, product_description_lenght, product_photos_qty...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Revisar cada dataset

str(c(items, orders, products))
## List of 24
##  $ order_id                     : chr [1:112650] "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
##  $ order_item_id                : num [1:112650] 1 1 1 1 1 1 1 1 1 1 ...
##  $ product_id                   : chr [1:112650] "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
##  $ seller_id                    : chr [1:112650] "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
##  $ shipping_limit_date          : POSIXct[1:112650], format: "2017-09-19 09:45:35" "2017-05-03 11:05:13" ...
##  $ price                        : num [1:112650] 58.9 239.9 199 13 199.9 ...
##  $ freight_value                : num [1:112650] 13.3 19.9 17.9 12.8 18.1 ...
##  $ order_id                     : chr [1:99441] "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
##  $ customer_id                  : chr [1:99441] "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
##  $ order_status                 : chr [1:99441] "delivered" "delivered" "delivered" "delivered" ...
##  $ order_purchase_timestamp     : POSIXct[1:99441], format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" ...
##  $ order_approved_at            : POSIXct[1:99441], format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" ...
##  $ order_delivered_carrier_date : POSIXct[1:99441], format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" ...
##  $ order_delivered_customer_date: POSIXct[1:99441], format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" ...
##  $ order_estimated_delivery_date: POSIXct[1:99441], format: "2017-10-18" "2018-08-13" ...
##  $ product_id                   : chr [1:32951] "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
##  $ product_category_name        : chr [1:32951] "perfumaria" "artes" "esporte_lazer" "bebes" ...
##  $ product_name_lenght          : num [1:32951] 40 44 46 27 37 60 56 56 57 36 ...
##  $ product_description_lenght   : num [1:32951] 287 276 250 261 402 ...
##  $ product_photos_qty           : num [1:32951] 1 1 1 1 4 1 4 2 1 1 ...
##  $ product_weight_g             : num [1:32951] 225 1000 154 371 625 ...
##  $ product_length_cm            : num [1:32951] 16 30 18 26 20 38 70 40 27 17 ...
##  $ product_height_cm            : num [1:32951] 10 18 9 4 17 5 24 8 13 10 ...
##  $ product_width_cm             : num [1:32951] 14 20 15 26 13 11 44 40 17 12 ...
summary(c(items,orders,products))
##                               Length Class   Mode     
## order_id                      112650 -none-  character
## order_item_id                 112650 -none-  numeric  
## product_id                    112650 -none-  character
## seller_id                     112650 -none-  character
## shipping_limit_date           112650 POSIXct numeric  
## price                         112650 -none-  numeric  
## freight_value                 112650 -none-  numeric  
## order_id                       99441 -none-  character
## customer_id                    99441 -none-  character
## order_status                   99441 -none-  character
## order_purchase_timestamp       99441 POSIXct numeric  
## order_approved_at              99441 POSIXct numeric  
## order_delivered_carrier_date   99441 POSIXct numeric  
## order_delivered_customer_date  99441 POSIXct numeric  
## order_estimated_delivery_date  99441 POSIXct numeric  
## product_id                     32951 -none-  character
## product_category_name          32951 -none-  character
## product_name_lenght            32951 -none-  numeric  
## product_description_lenght     32951 -none-  numeric  
## product_photos_qty             32951 -none-  numeric  
## product_weight_g               32951 -none-  numeric  
## product_length_cm              32951 -none-  numeric  
## product_height_cm              32951 -none-  numeric  
## product_width_cm               32951 -none-  numeric
names(c(items,orders,products))
##  [1] "order_id"                      "order_item_id"                
##  [3] "product_id"                    "seller_id"                    
##  [5] "shipping_limit_date"           "price"                        
##  [7] "freight_value"                 "order_id"                     
##  [9] "customer_id"                   "order_status"                 
## [11] "order_purchase_timestamp"      "order_approved_at"            
## [13] "order_delivered_carrier_date"  "order_delivered_customer_date"
## [15] "order_estimated_delivery_date" "product_id"                   
## [17] "product_category_name"         "product_name_lenght"          
## [19] "product_description_lenght"    "product_photos_qty"           
## [21] "product_weight_g"              "product_length_cm"            
## [23] "product_height_cm"             "product_width_cm"
head(items)
## # A tibble: 6 × 7
##   order_id          order_item_id product_id seller_id shipping_limit_date price
##   <chr>                     <dbl> <chr>      <chr>     <dttm>              <dbl>
## 1 00010242fe8c5a6d…             1 4244733e0… 48436dad… 2017-09-19 09:45:35  58.9
## 2 00018f77f2f0320c…             1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240. 
## 3 000229ec398224ef…             1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199  
## 4 00024acbcdf0a6da…             1 7634da152… 9d7a1d34… 2018-08-15 10:10:18  13.0
## 5 00042b26cf59d7ce…             1 ac6c36230… df560393… 2017-02-13 13:57:51 200. 
## 6 00048cc3ae777c65…             1 ef92defde… 6426d21a… 2017-05-23 03:55:27  21.9
## # ℹ 1 more variable: freight_value <dbl>
head(orders)
## # A tibble: 6 × 8
##   order_id   customer_id order_status order_purchase_times…¹ order_approved_at  
##   <chr>      <chr>       <chr>        <dttm>                 <dttm>             
## 1 e481f51cb… 9ef432eb62… delivered    2017-10-02 10:56:33    2017-10-02 11:07:15
## 2 53cdb2fc8… b0830fb474… delivered    2018-07-24 20:41:37    2018-07-26 03:24:27
## 3 47770eb91… 41ce2a54c0… delivered    2018-08-08 08:38:49    2018-08-08 08:55:23
## 4 949d5b44d… f88197465e… delivered    2017-11-18 19:28:06    2017-11-18 19:45:59
## 5 ad21c59c0… 8ab97904e6… delivered    2018-02-13 21:18:39    2018-02-13 22:20:29
## 6 a4591c265… 503740e9ca… delivered    2017-07-09 21:57:05    2017-07-09 22:10:13
## # ℹ abbreviated name: ¹​order_purchase_timestamp
## # ℹ 3 more variables: order_delivered_carrier_date <dttm>,
## #   order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>
head(products)
## # A tibble: 6 × 9
##   product_id    product_category_name product_name_lenght product_description_…¹
##   <chr>         <chr>                               <dbl>                  <dbl>
## 1 1e9e8ef04dbc… perfumaria                             40                    287
## 2 3aa071139cb1… artes                                  44                    276
## 3 96bd76ec8810… esporte_lazer                          46                    250
## 4 cef67bcfe190… bebes                                  27                    261
## 5 9dc1a7de2744… utilidades_domesticas                  37                    402
## 6 41d3672d4792… instrumentos_musicais                  60                    745
## # ℹ abbreviated name: ¹​product_description_lenght
## # ℹ 5 more variables: product_photos_qty <dbl>, product_weight_g <dbl>,
## #   product_length_cm <dbl>, product_height_cm <dbl>, product_width_cm <dbl>
dim(items)
## [1] 112650      7
dim(orders)
## [1] 99441     8
dim(products)
## [1] 32951     9

#Unión de “orders” con “items” utilizando la función “inner_join()”

merge1 <- inner_join(orders, items, by= "order_id")

#Unión de “merge1” con “products” utilizando la función “inner_join()”

merge2 <- inner_join(merge1, products, by= "product_id")

#Eliminar Filas con Valores Faltantes

merge3 <- merge2 %>% 
  drop_na() 

#Eliminación de las variables: “product_name_lenght”, “product_description_lenght”, “product_photos_qty” y “order_purchase_timestamp”

basefinal <- merge3 %>% 
  select(-"product_name_lenght", -"product_description_lenght", -"product_photos_qty", -"order_purchase_timestamp") 
glimpse(basefinal)
## Rows: 108,643
## Columns: 18
## $ order_id                      <chr> "e481f51cbdc54678b7cc49136f2d6af7", "53c…
## $ customer_id                   <chr> "9ef432eb6251297304e76186b10a928d", "b08…
## $ order_status                  <chr> "delivered", "delivered", "delivered", "…
## $ order_approved_at             <dttm> 2017-10-02 11:07:15, 2018-07-26 03:24:2…
## $ order_delivered_carrier_date  <dttm> 2017-10-04 19:55:00, 2018-07-26 14:31:0…
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2018-08-07 15:27:4…
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2018-08-13, 2018-09-04, 201…
## $ order_item_id                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1…
## $ product_id                    <chr> "87285b34884572647811a353c7ac498a", "595…
## $ seller_id                     <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "289…
## $ shipping_limit_date           <dttm> 2017-10-06 11:07:15, 2018-07-30 03:24:2…
## $ price                         <dbl> 29.99, 118.70, 159.90, 45.00, 19.90, 147…
## $ freight_value                 <dbl> 8.72, 22.76, 19.22, 27.20, 8.72, 27.36, …
## $ product_category_name         <chr> "utilidades_domesticas", "perfumaria", "…
## $ product_weight_g              <dbl> 500, 400, 420, 450, 250, 7150, 50, 300, …
## $ product_length_cm             <dbl> 19, 19, 24, 30, 51, 65, 16, 35, 42, 42, …
## $ product_height_cm             <dbl> 8, 13, 19, 10, 15, 10, 16, 35, 41, 12, 1…
## $ product_width_cm              <dbl> 13, 19, 21, 20, 15, 65, 17, 15, 42, 39, …

Exporta “basefinal” como un archivo .csv

write.csv(basefinal, file = "base_integrada_Albarrán_Marco.csv", row.names = FALSE)