library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.2
library(purrr)
setwd("C:/Users/marco.albarran/Desktop/Modelos Predictivos con Lenguaje R")
getwd()
## [1] "C:/Users/marco.albarran/Desktop/Modelos Predictivos con Lenguaje R"
items <- read_csv("olist_order_items_dataset.csv")
## Rows: 112650 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): order_id, product_id, seller_id
## dbl (3): order_item_id, price, freight_value
## dttm (1): shipping_limit_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
orders <- read_csv("olist_orders_dataset.csv")
## Rows: 99441 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): order_id, customer_id, order_status
## dttm (5): order_purchase_timestamp, order_approved_at, order_delivered_carri...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
products <- read_csv("olist_products_dataset.csv")
## Rows: 32951 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): product_id, product_category_name
## dbl (7): product_name_lenght, product_description_lenght, product_photos_qty...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(c(items, orders, products))
## List of 24
## $ order_id : chr [1:112650] "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
## $ order_item_id : num [1:112650] 1 1 1 1 1 1 1 1 1 1 ...
## $ product_id : chr [1:112650] "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
## $ seller_id : chr [1:112650] "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
## $ shipping_limit_date : POSIXct[1:112650], format: "2017-09-19 09:45:35" "2017-05-03 11:05:13" ...
## $ price : num [1:112650] 58.9 239.9 199 13 199.9 ...
## $ freight_value : num [1:112650] 13.3 19.9 17.9 12.8 18.1 ...
## $ order_id : chr [1:99441] "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
## $ customer_id : chr [1:99441] "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
## $ order_status : chr [1:99441] "delivered" "delivered" "delivered" "delivered" ...
## $ order_purchase_timestamp : POSIXct[1:99441], format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" ...
## $ order_approved_at : POSIXct[1:99441], format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" ...
## $ order_delivered_carrier_date : POSIXct[1:99441], format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" ...
## $ order_delivered_customer_date: POSIXct[1:99441], format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" ...
## $ order_estimated_delivery_date: POSIXct[1:99441], format: "2017-10-18" "2018-08-13" ...
## $ product_id : chr [1:32951] "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
## $ product_category_name : chr [1:32951] "perfumaria" "artes" "esporte_lazer" "bebes" ...
## $ product_name_lenght : num [1:32951] 40 44 46 27 37 60 56 56 57 36 ...
## $ product_description_lenght : num [1:32951] 287 276 250 261 402 ...
## $ product_photos_qty : num [1:32951] 1 1 1 1 4 1 4 2 1 1 ...
## $ product_weight_g : num [1:32951] 225 1000 154 371 625 ...
## $ product_length_cm : num [1:32951] 16 30 18 26 20 38 70 40 27 17 ...
## $ product_height_cm : num [1:32951] 10 18 9 4 17 5 24 8 13 10 ...
## $ product_width_cm : num [1:32951] 14 20 15 26 13 11 44 40 17 12 ...
summary(c(items,orders,products))
## Length Class Mode
## order_id 112650 -none- character
## order_item_id 112650 -none- numeric
## product_id 112650 -none- character
## seller_id 112650 -none- character
## shipping_limit_date 112650 POSIXct numeric
## price 112650 -none- numeric
## freight_value 112650 -none- numeric
## order_id 99441 -none- character
## customer_id 99441 -none- character
## order_status 99441 -none- character
## order_purchase_timestamp 99441 POSIXct numeric
## order_approved_at 99441 POSIXct numeric
## order_delivered_carrier_date 99441 POSIXct numeric
## order_delivered_customer_date 99441 POSIXct numeric
## order_estimated_delivery_date 99441 POSIXct numeric
## product_id 32951 -none- character
## product_category_name 32951 -none- character
## product_name_lenght 32951 -none- numeric
## product_description_lenght 32951 -none- numeric
## product_photos_qty 32951 -none- numeric
## product_weight_g 32951 -none- numeric
## product_length_cm 32951 -none- numeric
## product_height_cm 32951 -none- numeric
## product_width_cm 32951 -none- numeric
names(c(items,orders,products))
## [1] "order_id" "order_item_id"
## [3] "product_id" "seller_id"
## [5] "shipping_limit_date" "price"
## [7] "freight_value" "order_id"
## [9] "customer_id" "order_status"
## [11] "order_purchase_timestamp" "order_approved_at"
## [13] "order_delivered_carrier_date" "order_delivered_customer_date"
## [15] "order_estimated_delivery_date" "product_id"
## [17] "product_category_name" "product_name_lenght"
## [19] "product_description_lenght" "product_photos_qty"
## [21] "product_weight_g" "product_length_cm"
## [23] "product_height_cm" "product_width_cm"
head(items)
## # A tibble: 6 × 7
## order_id order_item_id product_id seller_id shipping_limit_date price
## <chr> <dbl> <chr> <chr> <dttm> <dbl>
## 1 00010242fe8c5a6d… 1 4244733e0… 48436dad… 2017-09-19 09:45:35 58.9
## 2 00018f77f2f0320c… 1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240.
## 3 000229ec398224ef… 1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199
## 4 00024acbcdf0a6da… 1 7634da152… 9d7a1d34… 2018-08-15 10:10:18 13.0
## 5 00042b26cf59d7ce… 1 ac6c36230… df560393… 2017-02-13 13:57:51 200.
## 6 00048cc3ae777c65… 1 ef92defde… 6426d21a… 2017-05-23 03:55:27 21.9
## # ℹ 1 more variable: freight_value <dbl>
head(orders)
## # A tibble: 6 × 8
## order_id customer_id order_status order_purchase_times…¹ order_approved_at
## <chr> <chr> <chr> <dttm> <dttm>
## 1 e481f51cb… 9ef432eb62… delivered 2017-10-02 10:56:33 2017-10-02 11:07:15
## 2 53cdb2fc8… b0830fb474… delivered 2018-07-24 20:41:37 2018-07-26 03:24:27
## 3 47770eb91… 41ce2a54c0… delivered 2018-08-08 08:38:49 2018-08-08 08:55:23
## 4 949d5b44d… f88197465e… delivered 2017-11-18 19:28:06 2017-11-18 19:45:59
## 5 ad21c59c0… 8ab97904e6… delivered 2018-02-13 21:18:39 2018-02-13 22:20:29
## 6 a4591c265… 503740e9ca… delivered 2017-07-09 21:57:05 2017-07-09 22:10:13
## # ℹ abbreviated name: ¹order_purchase_timestamp
## # ℹ 3 more variables: order_delivered_carrier_date <dttm>,
## # order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>
head(products)
## # A tibble: 6 × 9
## product_id product_category_name product_name_lenght product_description_…¹
## <chr> <chr> <dbl> <dbl>
## 1 1e9e8ef04dbc… perfumaria 40 287
## 2 3aa071139cb1… artes 44 276
## 3 96bd76ec8810… esporte_lazer 46 250
## 4 cef67bcfe190… bebes 27 261
## 5 9dc1a7de2744… utilidades_domesticas 37 402
## 6 41d3672d4792… instrumentos_musicais 60 745
## # ℹ abbreviated name: ¹product_description_lenght
## # ℹ 5 more variables: product_photos_qty <dbl>, product_weight_g <dbl>,
## # product_length_cm <dbl>, product_height_cm <dbl>, product_width_cm <dbl>
dim(items)
## [1] 112650 7
dim(orders)
## [1] 99441 8
dim(products)
## [1] 32951 9
#Unión de “orders” con “items” utilizando la función “inner_join()”
merge1 <- inner_join(orders, items, by= "order_id")
#Unión de “merge1” con “products” utilizando la función “inner_join()”
merge2 <- inner_join(merge1, products, by= "product_id")
#Eliminar Filas con Valores Faltantes
merge3 <- merge2 %>%
drop_na()
#Eliminación de las variables: “product_name_lenght”, “product_description_lenght”, “product_photos_qty” y “order_purchase_timestamp”
basefinal <- merge3 %>%
select(-"product_name_lenght", -"product_description_lenght", -"product_photos_qty", -"order_purchase_timestamp")
glimpse(basefinal)
## Rows: 108,643
## Columns: 18
## $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "53c…
## $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "b08…
## $ order_status <chr> "delivered", "delivered", "delivered", "…
## $ order_approved_at <dttm> 2017-10-02 11:07:15, 2018-07-26 03:24:2…
## $ order_delivered_carrier_date <dttm> 2017-10-04 19:55:00, 2018-07-26 14:31:0…
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2018-08-07 15:27:4…
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2018-08-13, 2018-09-04, 201…
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1…
## $ product_id <chr> "87285b34884572647811a353c7ac498a", "595…
## $ seller_id <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "289…
## $ shipping_limit_date <dttm> 2017-10-06 11:07:15, 2018-07-30 03:24:2…
## $ price <dbl> 29.99, 118.70, 159.90, 45.00, 19.90, 147…
## $ freight_value <dbl> 8.72, 22.76, 19.22, 27.20, 8.72, 27.36, …
## $ product_category_name <chr> "utilidades_domesticas", "perfumaria", "…
## $ product_weight_g <dbl> 500, 400, 420, 450, 250, 7150, 50, 300, …
## $ product_length_cm <dbl> 19, 19, 24, 30, 51, 65, 16, 35, 42, 42, …
## $ product_height_cm <dbl> 8, 13, 19, 10, 15, 10, 16, 35, 41, 12, 1…
## $ product_width_cm <dbl> 13, 19, 21, 20, 15, 65, 17, 15, 42, 39, …
write.csv(basefinal, file = "base_integrada_Albarrán_Marco.csv", row.names = FALSE)