:: Libraries

if(!require("reticulate")){install.packages("reticulate")}
if(!require("tidyverse")){install.packages("tidyverse")}
if(!require("remotes")){install.packages("remotes")}
if(!require("stringr.plus"))
    { remotes::install_github("johncassil/stringr.plus") }

:: Read

Se cada .csv estiver em um arquivo zipado próprio, e todos os zipados estiverem na pasta data, apesar de podermos fazer uma leitura direta, apenas adicionando o .zip ao fim do arquivo: . Vamos aproveitar para conhecer como fazer a leitura iterativamente no R:

#the unziped file
    df_sellers = readr::read_csv('data/olist_sellers_dataset.csv')

#the zip files
    # getting all .csv names
    file_names_wd <- list.files("data/", pattern = "*.zip", full.names = T)
    
    #read each file
    file_content <- file_names_wd %>% 
        purrr::map(~read_csv(.))
    
    #build each file name
    file_names_df <- file_names_wd %>%
        stringr.plus::str_extract_between(
            pattern1 = 'olist_', pattern2 = '_dataset.csv.zip')  %>%
        paste0("df_", .)
    
        #option without stringr.plus
            # file_names_df <- file_names_wd %>%
            #     purrr::map_chr(stringr::str_extract, "_\\w+_") %>%
            #     purrr::map_chr(stringr::str_sub,2,-2) %>%
            #     paste0("df_", .)
    
    
    #assign file_names_df to each file_content
    purrr::walk2(.x = file_names_df, .y = file_content,
          .f = ~assign(x = .x, value = .y, envir = .GlobalEnv))
    
        #~ walk2() is similar to map2()

:: Join

df_olist <- df_orders %>% 
    dplyr::left_join(df_order_items, by = "order_id") %>% 
    dplyr::full_join(df_order_payments, by = "order_id") %>% 
    dplyr::full_join(df_order_reviews, by = "order_id") %>% 
    dplyr::full_join(df_products, by = "product_id") %>% 
    dplyr::full_join(df_customers, by = "customer_id") %>% 
    dplyr::full_join(df_sellers, by = "seller_id") %>% 
    glimpse
## Rows: 119,151
## Columns: 39
## $ order_id                      <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id                   <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status                  <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp      <dttm> 2017-10-02 10:56:33, 2017-10-02 10:5...
## $ order_approved_at             <dttm> 2017-10-02 11:07:15, 2017-10-02 11:0...
## $ order_delivered_carrier_date  <dttm> 2017-10-04 19:55:00, 2017-10-04 19:5...
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2017-10-10 21:2...
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2017-10-18, 2017-10-18, ...
## $ order_item_id                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id                    <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id                     <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date           <dttm> 2017-10-06 11:07:15, 2017-10-06 11:0...
## $ price                         <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value                 <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential            <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type                  <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments          <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value                 <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id                     <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score                  <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title          <chr> NA, NA, NA, "Muito boa a loja", NA, N...
## $ review_comment_message        <chr> "Não testei o produto ainda, mas ele ...
## $ review_creation_date          <dttm> 2017-10-11, 2017-10-11, 2017-10-11, ...
## $ review_answer_timestamp       <dttm> 2017-10-12 03:43:48, 2017-10-12 03:4...
## $ product_category_name         <chr> "utilidades_domesticas", "utilidades_...
## $ product_name_lenght           <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NA, 5...
## $ product_description_lenght    <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty            <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NA, 1, 2, 1, ...
## $ product_weight_g              <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm             <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm             <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm              <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id            <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix      <chr> "03149", "03149", "03149", "47813", "...
## $ customer_city                 <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state                <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix        <chr> "09350", "09350", "09350", "31570", "...
## $ seller_city                   <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state                  <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...

.