Introducion

En esta primera actividad se descargaran las bases de datos , se eliminaran los valores nulos , se quitan duplicados, y se consolida una sola tabla

library(readr)
## Warning: package 'readr' was built under R version 4.2.3
df_items<- read_csv("olist_order_items_dataset.csv")
## Rows: 112650 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): order_id, product_id, seller_id
## dbl  (3): order_item_id, price, freight_value
## dttm (1): shipping_limit_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
##   order_id          order_item_id product_id seller_id shipping_limit_date price
##   <chr>                     <dbl> <chr>      <chr>     <dttm>              <dbl>
## 1 00010242fe8c5a6d…             1 4244733e0… 48436dad… 2017-09-19 09:45:35  58.9
## 2 00018f77f2f0320c…             1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240. 
## 3 000229ec398224ef…             1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199  
## 4 00024acbcdf0a6da…             1 7634da152… 9d7a1d34… 2018-08-15 10:10:18  13.0
## 5 00042b26cf59d7ce…             1 ac6c36230… df560393… 2017-02-13 13:57:51 200. 
## 6 00048cc3ae777c65…             1 ef92defde… 6426d21a… 2017-05-23 03:55:27  21.9
## # ℹ 1 more variable: freight_value <dbl>
df_orders<- read_csv("olist_orders_dataset.csv")
## Rows: 99441 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): order_id, customer_id, order_status
## dttm (5): order_purchase_timestamp, order_approved_at, order_delivered_carri...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
##   order_id          order_item_id product_id seller_id shipping_limit_date price
##   <chr>                     <dbl> <chr>      <chr>     <dttm>              <dbl>
## 1 00010242fe8c5a6d…             1 4244733e0… 48436dad… 2017-09-19 09:45:35  58.9
## 2 00018f77f2f0320c…             1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240. 
## 3 000229ec398224ef…             1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199  
## 4 00024acbcdf0a6da…             1 7634da152… 9d7a1d34… 2018-08-15 10:10:18  13.0
## 5 00042b26cf59d7ce…             1 ac6c36230… df560393… 2017-02-13 13:57:51 200. 
## 6 00048cc3ae777c65…             1 ef92defde… 6426d21a… 2017-05-23 03:55:27  21.9
## # ℹ 1 more variable: freight_value <dbl>
df_products<- read_csv("olist_products_dataset.csv")
## Rows: 32951 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): product_id, product_category_name
## dbl (7): product_name_lenght, product_description_lenght, product_photos_qty...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_items)
## # A tibble: 6 × 7
##   order_id          order_item_id product_id seller_id shipping_limit_date price
##   <chr>                     <dbl> <chr>      <chr>     <dttm>              <dbl>
## 1 00010242fe8c5a6d…             1 4244733e0… 48436dad… 2017-09-19 09:45:35  58.9
## 2 00018f77f2f0320c…             1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240. 
## 3 000229ec398224ef…             1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199  
## 4 00024acbcdf0a6da…             1 7634da152… 9d7a1d34… 2018-08-15 10:10:18  13.0
## 5 00042b26cf59d7ce…             1 ac6c36230… df560393… 2017-02-13 13:57:51 200. 
## 6 00048cc3ae777c65…             1 ef92defde… 6426d21a… 2017-05-23 03:55:27  21.9
## # ℹ 1 more variable: freight_value <dbl>

Quitar nulos

df_items<-na.omit(df_items)

df_products<-na.omit(df_products)

df_orders<-na.omit(df_orders)

Analisis de datos

# str
str(df_products)
## tibble [32,340 × 9] (S3: tbl_df/tbl/data.frame)
##  $ product_id                : chr [1:32340] "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
##  $ product_category_name     : chr [1:32340] "perfumaria" "artes" "esporte_lazer" "bebes" ...
##  $ product_name_lenght       : num [1:32340] 40 44 46 27 37 60 56 56 57 36 ...
##  $ product_description_lenght: num [1:32340] 287 276 250 261 402 ...
##  $ product_photos_qty        : num [1:32340] 1 1 1 1 4 1 4 2 1 1 ...
##  $ product_weight_g          : num [1:32340] 225 1000 154 371 625 ...
##  $ product_length_cm         : num [1:32340] 16 30 18 26 20 38 70 40 27 17 ...
##  $ product_height_cm         : num [1:32340] 10 18 9 4 17 5 24 8 13 10 ...
##  $ product_width_cm          : num [1:32340] 14 20 15 26 13 11 44 40 17 12 ...
##  - attr(*, "na.action")= 'omit' Named int [1:611] 106 129 146 155 198 245 295 300 348 429 ...
##   ..- attr(*, "names")= chr [1:611] "106" "129" "146" "155" ...
str(df_items)
## tibble [112,650 × 7] (S3: tbl_df/tbl/data.frame)
##  $ order_id           : chr [1:112650] "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
##  $ order_item_id      : num [1:112650] 1 1 1 1 1 1 1 1 1 1 ...
##  $ product_id         : chr [1:112650] "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
##  $ seller_id          : chr [1:112650] "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
##  $ shipping_limit_date: POSIXct[1:112650], format: "2017-09-19 09:45:35" "2017-05-03 11:05:13" ...
##  $ price              : num [1:112650] 58.9 239.9 199 13 199.9 ...
##  $ freight_value      : num [1:112650] 13.3 19.9 17.9 12.8 18.1 ...
str(df_orders)
## tibble [96,461 × 8] (S3: tbl_df/tbl/data.frame)
##  $ order_id                     : chr [1:96461] "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
##  $ customer_id                  : chr [1:96461] "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
##  $ order_status                 : chr [1:96461] "delivered" "delivered" "delivered" "delivered" ...
##  $ order_purchase_timestamp     : POSIXct[1:96461], format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" ...
##  $ order_approved_at            : POSIXct[1:96461], format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" ...
##  $ order_delivered_carrier_date : POSIXct[1:96461], format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" ...
##  $ order_delivered_customer_date: POSIXct[1:96461], format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" ...
##  $ order_estimated_delivery_date: POSIXct[1:96461], format: "2017-10-18" "2018-08-13" ...
##  - attr(*, "na.action")= 'omit' Named int [1:2980] 7 45 104 129 155 163 232 267 300 306 ...
##   ..- attr(*, "names")= chr [1:2980] "7" "45" "104" "129" ...
#summary

summary(df_products)
##   product_id        product_category_name product_name_lenght
##  Length:32340       Length:32340          Min.   : 5.00      
##  Class :character   Class :character      1st Qu.:42.00      
##  Mode  :character   Mode  :character      Median :51.00      
##                                           Mean   :48.48      
##                                           3rd Qu.:57.00      
##                                           Max.   :76.00      
##  product_description_lenght product_photos_qty product_weight_g
##  Min.   :   4.0             Min.   : 1.000     Min.   :    0   
##  1st Qu.: 339.0             1st Qu.: 1.000     1st Qu.:  300   
##  Median : 595.0             Median : 1.000     Median :  700   
##  Mean   : 771.5             Mean   : 2.189     Mean   : 2277   
##  3rd Qu.: 972.0             3rd Qu.: 3.000     3rd Qu.: 1900   
##  Max.   :3992.0             Max.   :20.000     Max.   :40425   
##  product_length_cm product_height_cm product_width_cm
##  Min.   :  7.00    Min.   :  2.00    Min.   :  6.00  
##  1st Qu.: 18.00    1st Qu.:  8.00    1st Qu.: 15.00  
##  Median : 25.00    Median : 13.00    Median : 20.00  
##  Mean   : 30.85    Mean   : 16.96    Mean   : 23.21  
##  3rd Qu.: 38.00    3rd Qu.: 21.00    3rd Qu.: 30.00  
##  Max.   :105.00    Max.   :105.00    Max.   :118.00
summary(df_items)
##    order_id         order_item_id     product_id         seller_id        
##  Length:112650      Min.   : 1.000   Length:112650      Length:112650     
##  Class :character   1st Qu.: 1.000   Class :character   Class :character  
##  Mode  :character   Median : 1.000   Mode  :character   Mode  :character  
##                     Mean   : 1.198                                        
##                     3rd Qu.: 1.000                                        
##                     Max.   :21.000                                        
##  shipping_limit_date                  price         freight_value   
##  Min.   :2016-09-19 00:15:34.00   Min.   :   0.85   Min.   :  0.00  
##  1st Qu.:2017-09-20 20:57:27.50   1st Qu.:  39.90   1st Qu.: 13.08  
##  Median :2018-01-26 13:59:35.00   Median :  74.99   Median : 16.26  
##  Mean   :2018-01-07 15:36:52.19   Mean   : 120.65   Mean   : 19.99  
##  3rd Qu.:2018-05-10 14:34:00.75   3rd Qu.: 134.90   3rd Qu.: 21.15  
##  Max.   :2020-04-09 22:35:08.00   Max.   :6735.00   Max.   :409.68
summary(df_orders)
##    order_id         customer_id        order_status      
##  Length:96461       Length:96461       Length:96461      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  order_purchase_timestamp         order_approved_at               
##  Min.   :2016-09-15 12:16:38.00   Min.   :2016-09-15 12:16:38.00  
##  1st Qu.:2017-09-14 09:28:28.00   1st Qu.:2017-09-14 14:30:14.00  
##  Median :2018-01-20 19:59:42.00   Median :2018-01-22 13:49:00.00  
##  Mean   :2018-01-01 23:53:26.64   Mean   :2018-01-02 10:10:06.48  
##  3rd Qu.:2018-05-05 18:33:24.00   3rd Qu.:2018-05-06 10:30:49.00  
##  Max.   :2018-08-29 15:00:37.00   Max.   :2018-08-29 15:10:26.00  
##  order_delivered_carrier_date     order_delivered_customer_date   
##  Min.   :2016-10-08 10:34:01.00   Min.   :2016-10-11 13:46:32.00  
##  1st Qu.:2017-09-18 16:52:19.00   1st Qu.:2017-09-25 22:31:59.00  
##  Median :2018-01-24 16:19:03.00   Median :2018-02-02 19:50:56.00  
##  Mean   :2018-01-05 05:21:04.50   Mean   :2018-01-14 13:17:13.23  
##  3rd Qu.:2018-05-08 14:33:00.00   3rd Qu.:2018-05-15 23:08:54.00  
##  Max.   :2018-09-11 19:48:28.00   Max.   :2018-10-17 13:22:46.00  
##  order_estimated_delivery_date   
##  Min.   :2016-10-04 00:00:00.00  
##  1st Qu.:2017-10-05 00:00:00.00  
##  Median :2018-02-16 00:00:00.00  
##  Mean   :2018-01-25 17:33:14.24  
##  3rd Qu.:2018-05-28 00:00:00.00  
##  Max.   :2018-10-25 00:00:00.00
#names()

names(df_products)
## [1] "product_id"                 "product_category_name"     
## [3] "product_name_lenght"        "product_description_lenght"
## [5] "product_photos_qty"         "product_weight_g"          
## [7] "product_length_cm"          "product_height_cm"         
## [9] "product_width_cm"
names(df_items)
## [1] "order_id"            "order_item_id"       "product_id"         
## [4] "seller_id"           "shipping_limit_date" "price"              
## [7] "freight_value"
names(df_orders)
## [1] "order_id"                      "customer_id"                  
## [3] "order_status"                  "order_purchase_timestamp"     
## [5] "order_approved_at"             "order_delivered_carrier_date" 
## [7] "order_delivered_customer_date" "order_estimated_delivery_date"

Join

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
resultado<-inner_join(df_orders,df_items, by = "order_id") 

df_consolidado<-inner_join(resultado,df_products, by = "product_id") 

head(df_consolidado)
## # A tibble: 6 × 22
##   order_id   customer_id order_status order_purchase_times…¹ order_approved_at  
##   <chr>      <chr>       <chr>        <dttm>                 <dttm>             
## 1 e481f51cb… 9ef432eb62… delivered    2017-10-02 10:56:33    2017-10-02 11:07:15
## 2 53cdb2fc8… b0830fb474… delivered    2018-07-24 20:41:37    2018-07-26 03:24:27
## 3 47770eb91… 41ce2a54c0… delivered    2018-08-08 08:38:49    2018-08-08 08:55:23
## 4 949d5b44d… f88197465e… delivered    2017-11-18 19:28:06    2017-11-18 19:45:59
## 5 ad21c59c0… 8ab97904e6… delivered    2018-02-13 21:18:39    2018-02-13 22:20:29
## 6 a4591c265… 503740e9ca… delivered    2017-07-09 21:57:05    2017-07-09 22:10:13
## # ℹ abbreviated name: ¹​order_purchase_timestamp
## # ℹ 17 more variables: order_delivered_carrier_date <dttm>,
## #   order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>,
## #   order_item_id <dbl>, product_id <chr>, seller_id <chr>,
## #   shipping_limit_date <dttm>, price <dbl>, freight_value <dbl>,
## #   product_category_name <chr>, product_name_lenght <dbl>,
## #   product_description_lenght <dbl>, product_photos_qty <dbl>, …
names(df_consolidado)[colSums(is.na(df_consolidado)) > 0]
## character(0)
df_consolidado[duplicated(df_consolidado), ]
## # A tibble: 0 × 22
## # ℹ 22 variables: order_id <chr>, customer_id <chr>, order_status <chr>,
## #   order_purchase_timestamp <dttm>, order_approved_at <dttm>,
## #   order_delivered_carrier_date <dttm>, order_delivered_customer_date <dttm>,
## #   order_estimated_delivery_date <dttm>, order_item_id <dbl>,
## #   product_id <chr>, seller_id <chr>, shipping_limit_date <dttm>, price <dbl>,
## #   freight_value <dbl>, product_category_name <chr>,
## #   product_name_lenght <dbl>, product_description_lenght <dbl>, …