En esta primera actividad se descargaran las bases de datos , se eliminaran los valores nulos , se quitan duplicados, y se consolida una sola tabla
library(readr)
## Warning: package 'readr' was built under R version 4.2.3
df_items<- read_csv("olist_order_items_dataset.csv")
## Rows: 112650 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): order_id, product_id, seller_id
## dbl (3): order_item_id, price, freight_value
## dttm (1): shipping_limit_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
## order_id order_item_id product_id seller_id shipping_limit_date price
## <chr> <dbl> <chr> <chr> <dttm> <dbl>
## 1 00010242fe8c5a6d… 1 4244733e0… 48436dad… 2017-09-19 09:45:35 58.9
## 2 00018f77f2f0320c… 1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240.
## 3 000229ec398224ef… 1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199
## 4 00024acbcdf0a6da… 1 7634da152… 9d7a1d34… 2018-08-15 10:10:18 13.0
## 5 00042b26cf59d7ce… 1 ac6c36230… df560393… 2017-02-13 13:57:51 200.
## 6 00048cc3ae777c65… 1 ef92defde… 6426d21a… 2017-05-23 03:55:27 21.9
## # ℹ 1 more variable: freight_value <dbl>
df_orders<- read_csv("olist_orders_dataset.csv")
## Rows: 99441 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): order_id, customer_id, order_status
## dttm (5): order_purchase_timestamp, order_approved_at, order_delivered_carri...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
## order_id order_item_id product_id seller_id shipping_limit_date price
## <chr> <dbl> <chr> <chr> <dttm> <dbl>
## 1 00010242fe8c5a6d… 1 4244733e0… 48436dad… 2017-09-19 09:45:35 58.9
## 2 00018f77f2f0320c… 1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240.
## 3 000229ec398224ef… 1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199
## 4 00024acbcdf0a6da… 1 7634da152… 9d7a1d34… 2018-08-15 10:10:18 13.0
## 5 00042b26cf59d7ce… 1 ac6c36230… df560393… 2017-02-13 13:57:51 200.
## 6 00048cc3ae777c65… 1 ef92defde… 6426d21a… 2017-05-23 03:55:27 21.9
## # ℹ 1 more variable: freight_value <dbl>
df_products<- read_csv("olist_products_dataset.csv")
## Rows: 32951 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): product_id, product_category_name
## dbl (7): product_name_lenght, product_description_lenght, product_photos_qty...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
## order_id order_item_id product_id seller_id shipping_limit_date price
## <chr> <dbl> <chr> <chr> <dttm> <dbl>
## 1 00010242fe8c5a6d… 1 4244733e0… 48436dad… 2017-09-19 09:45:35 58.9
## 2 00018f77f2f0320c… 1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240.
## 3 000229ec398224ef… 1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199
## 4 00024acbcdf0a6da… 1 7634da152… 9d7a1d34… 2018-08-15 10:10:18 13.0
## 5 00042b26cf59d7ce… 1 ac6c36230… df560393… 2017-02-13 13:57:51 200.
## 6 00048cc3ae777c65… 1 ef92defde… 6426d21a… 2017-05-23 03:55:27 21.9
## # ℹ 1 more variable: freight_value <dbl>
df_items<-na.omit(df_items)
df_products<-na.omit(df_products)
df_orders<-na.omit(df_orders)
# str
str(df_products)
## tibble [32,340 × 9] (S3: tbl_df/tbl/data.frame)
## $ product_id : chr [1:32340] "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
## $ product_category_name : chr [1:32340] "perfumaria" "artes" "esporte_lazer" "bebes" ...
## $ product_name_lenght : num [1:32340] 40 44 46 27 37 60 56 56 57 36 ...
## $ product_description_lenght: num [1:32340] 287 276 250 261 402 ...
## $ product_photos_qty : num [1:32340] 1 1 1 1 4 1 4 2 1 1 ...
## $ product_weight_g : num [1:32340] 225 1000 154 371 625 ...
## $ product_length_cm : num [1:32340] 16 30 18 26 20 38 70 40 27 17 ...
## $ product_height_cm : num [1:32340] 10 18 9 4 17 5 24 8 13 10 ...
## $ product_width_cm : num [1:32340] 14 20 15 26 13 11 44 40 17 12 ...
## - attr(*, "na.action")= 'omit' Named int [1:611] 106 129 146 155 198 245 295 300 348 429 ...
## ..- attr(*, "names")= chr [1:611] "106" "129" "146" "155" ...
str(df_items)
## tibble [112,650 × 7] (S3: tbl_df/tbl/data.frame)
## $ order_id : chr [1:112650] "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
## $ order_item_id : num [1:112650] 1 1 1 1 1 1 1 1 1 1 ...
## $ product_id : chr [1:112650] "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
## $ seller_id : chr [1:112650] "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
## $ shipping_limit_date: POSIXct[1:112650], format: "2017-09-19 09:45:35" "2017-05-03 11:05:13" ...
## $ price : num [1:112650] 58.9 239.9 199 13 199.9 ...
## $ freight_value : num [1:112650] 13.3 19.9 17.9 12.8 18.1 ...
str(df_orders)
## tibble [96,461 × 8] (S3: tbl_df/tbl/data.frame)
## $ order_id : chr [1:96461] "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
## $ customer_id : chr [1:96461] "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
## $ order_status : chr [1:96461] "delivered" "delivered" "delivered" "delivered" ...
## $ order_purchase_timestamp : POSIXct[1:96461], format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" ...
## $ order_approved_at : POSIXct[1:96461], format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" ...
## $ order_delivered_carrier_date : POSIXct[1:96461], format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" ...
## $ order_delivered_customer_date: POSIXct[1:96461], format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" ...
## $ order_estimated_delivery_date: POSIXct[1:96461], format: "2017-10-18" "2018-08-13" ...
## - attr(*, "na.action")= 'omit' Named int [1:2980] 7 45 104 129 155 163 232 267 300 306 ...
## ..- attr(*, "names")= chr [1:2980] "7" "45" "104" "129" ...
#summary
summary(df_products)
## product_id product_category_name product_name_lenght
## Length:32340 Length:32340 Min. : 5.00
## Class :character Class :character 1st Qu.:42.00
## Mode :character Mode :character Median :51.00
## Mean :48.48
## 3rd Qu.:57.00
## Max. :76.00
## product_description_lenght product_photos_qty product_weight_g
## Min. : 4.0 Min. : 1.000 Min. : 0
## 1st Qu.: 339.0 1st Qu.: 1.000 1st Qu.: 300
## Median : 595.0 Median : 1.000 Median : 700
## Mean : 771.5 Mean : 2.189 Mean : 2277
## 3rd Qu.: 972.0 3rd Qu.: 3.000 3rd Qu.: 1900
## Max. :3992.0 Max. :20.000 Max. :40425
## product_length_cm product_height_cm product_width_cm
## Min. : 7.00 Min. : 2.00 Min. : 6.00
## 1st Qu.: 18.00 1st Qu.: 8.00 1st Qu.: 15.00
## Median : 25.00 Median : 13.00 Median : 20.00
## Mean : 30.85 Mean : 16.96 Mean : 23.21
## 3rd Qu.: 38.00 3rd Qu.: 21.00 3rd Qu.: 30.00
## Max. :105.00 Max. :105.00 Max. :118.00
summary(df_items)
## order_id order_item_id product_id seller_id
## Length:112650 Min. : 1.000 Length:112650 Length:112650
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 1.000 Mode :character Mode :character
## Mean : 1.198
## 3rd Qu.: 1.000
## Max. :21.000
## shipping_limit_date price freight_value
## Min. :2016-09-19 00:15:34.00 Min. : 0.85 Min. : 0.00
## 1st Qu.:2017-09-20 20:57:27.50 1st Qu.: 39.90 1st Qu.: 13.08
## Median :2018-01-26 13:59:35.00 Median : 74.99 Median : 16.26
## Mean :2018-01-07 15:36:52.19 Mean : 120.65 Mean : 19.99
## 3rd Qu.:2018-05-10 14:34:00.75 3rd Qu.: 134.90 3rd Qu.: 21.15
## Max. :2020-04-09 22:35:08.00 Max. :6735.00 Max. :409.68
summary(df_orders)
## order_id customer_id order_status
## Length:96461 Length:96461 Length:96461
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## order_purchase_timestamp order_approved_at
## Min. :2016-09-15 12:16:38.00 Min. :2016-09-15 12:16:38.00
## 1st Qu.:2017-09-14 09:28:28.00 1st Qu.:2017-09-14 14:30:14.00
## Median :2018-01-20 19:59:42.00 Median :2018-01-22 13:49:00.00
## Mean :2018-01-01 23:53:26.64 Mean :2018-01-02 10:10:06.48
## 3rd Qu.:2018-05-05 18:33:24.00 3rd Qu.:2018-05-06 10:30:49.00
## Max. :2018-08-29 15:00:37.00 Max. :2018-08-29 15:10:26.00
## order_delivered_carrier_date order_delivered_customer_date
## Min. :2016-10-08 10:34:01.00 Min. :2016-10-11 13:46:32.00
## 1st Qu.:2017-09-18 16:52:19.00 1st Qu.:2017-09-25 22:31:59.00
## Median :2018-01-24 16:19:03.00 Median :2018-02-02 19:50:56.00
## Mean :2018-01-05 05:21:04.50 Mean :2018-01-14 13:17:13.23
## 3rd Qu.:2018-05-08 14:33:00.00 3rd Qu.:2018-05-15 23:08:54.00
## Max. :2018-09-11 19:48:28.00 Max. :2018-10-17 13:22:46.00
## order_estimated_delivery_date
## Min. :2016-10-04 00:00:00.00
## 1st Qu.:2017-10-05 00:00:00.00
## Median :2018-02-16 00:00:00.00
## Mean :2018-01-25 17:33:14.24
## 3rd Qu.:2018-05-28 00:00:00.00
## Max. :2018-10-25 00:00:00.00
#names()
names(df_products)
## [1] "product_id" "product_category_name"
## [3] "product_name_lenght" "product_description_lenght"
## [5] "product_photos_qty" "product_weight_g"
## [7] "product_length_cm" "product_height_cm"
## [9] "product_width_cm"
names(df_items)
## [1] "order_id" "order_item_id" "product_id"
## [4] "seller_id" "shipping_limit_date" "price"
## [7] "freight_value"
names(df_orders)
## [1] "order_id" "customer_id"
## [3] "order_status" "order_purchase_timestamp"
## [5] "order_approved_at" "order_delivered_carrier_date"
## [7] "order_delivered_customer_date" "order_estimated_delivery_date"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
resultado<-inner_join(df_orders,df_items, by = "order_id")
df_consolidado<-inner_join(resultado,df_products, by = "product_id")
head(df_consolidado)
## # A tibble: 6 × 22
## order_id customer_id order_status order_purchase_times…¹ order_approved_at
## <chr> <chr> <chr> <dttm> <dttm>
## 1 e481f51cb… 9ef432eb62… delivered 2017-10-02 10:56:33 2017-10-02 11:07:15
## 2 53cdb2fc8… b0830fb474… delivered 2018-07-24 20:41:37 2018-07-26 03:24:27
## 3 47770eb91… 41ce2a54c0… delivered 2018-08-08 08:38:49 2018-08-08 08:55:23
## 4 949d5b44d… f88197465e… delivered 2017-11-18 19:28:06 2017-11-18 19:45:59
## 5 ad21c59c0… 8ab97904e6… delivered 2018-02-13 21:18:39 2018-02-13 22:20:29
## 6 a4591c265… 503740e9ca… delivered 2017-07-09 21:57:05 2017-07-09 22:10:13
## # ℹ abbreviated name: ¹order_purchase_timestamp
## # ℹ 17 more variables: order_delivered_carrier_date <dttm>,
## # order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>,
## # order_item_id <dbl>, product_id <chr>, seller_id <chr>,
## # shipping_limit_date <dttm>, price <dbl>, freight_value <dbl>,
## # product_category_name <chr>, product_name_lenght <dbl>,
## # product_description_lenght <dbl>, product_photos_qty <dbl>, …
names(df_consolidado)[colSums(is.na(df_consolidado)) > 0]
## character(0)
df_consolidado[duplicated(df_consolidado), ]
## # A tibble: 0 × 22
## # ℹ 22 variables: order_id <chr>, customer_id <chr>, order_status <chr>,
## # order_purchase_timestamp <dttm>, order_approved_at <dttm>,
## # order_delivered_carrier_date <dttm>, order_delivered_customer_date <dttm>,
## # order_estimated_delivery_date <dttm>, order_item_id <dbl>,
## # product_id <chr>, seller_id <chr>, shipping_limit_date <dttm>, price <dbl>,
## # freight_value <dbl>, product_category_name <chr>,
## # product_name_lenght <dbl>, product_description_lenght <dbl>, …