if(!require("reticulate")){install.packages("reticulate")}
if(!require("tidyverse")){install.packages("tidyverse")}
if(!require("remotes")){install.packages("remotes")}
if(!require("stringr.plus"))
{ remotes::install_github("johncassil/stringr.plus") }
Se cada .csv estiver em um arquivo zipado próprio, e todos os zipados estiverem na pasta data, apesar de podermos fazer uma leitura direta, apenas adicionando o .zip ao fim do arquivo: . Vamos aproveitar para conhecer como fazer a leitura iterativamente no R:
#the unziped file
df_sellers = readr::read_csv('data/olist_sellers_dataset.csv')
#the zip files
# getting all .csv names
file_names_wd <- list.files("data/", pattern = "*.zip", full.names = T)
#read each file
file_content <- file_names_wd %>%
purrr::map(~read_csv(.))
#build each file name
file_names_df <- file_names_wd %>%
stringr.plus::str_extract_between(
pattern1 = 'olist_', pattern2 = '_dataset.csv.zip') %>%
paste0("df_", .)
#option without stringr.plus
# file_names_df <- file_names_wd %>%
# purrr::map_chr(stringr::str_extract, "_\\w+_") %>%
# purrr::map_chr(stringr::str_sub,2,-2) %>%
# paste0("df_", .)
#assign file_names_df to each file_content
purrr::walk2(.x = file_names_df, .y = file_content,
.f = ~assign(x = .x, value = .y, envir = .GlobalEnv))
#~ walk2() is similar to map2()
df_olist <- df_orders %>%
dplyr::left_join(df_order_items, by = "order_id") %>%
dplyr::full_join(df_order_payments, by = "order_id") %>%
dplyr::full_join(df_order_reviews, by = "order_id") %>%
dplyr::full_join(df_products, by = "product_id") %>%
dplyr::full_join(df_customers, by = "customer_id") %>%
dplyr::full_join(df_sellers, by = "seller_id") %>%
glimpse
## Rows: 119,151
## Columns: 39
## $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp <dttm> 2017-10-02 10:56:33, 2017-10-02 10:5...
## $ order_approved_at <dttm> 2017-10-02 11:07:15, 2017-10-02 11:0...
## $ order_delivered_carrier_date <dttm> 2017-10-04 19:55:00, 2017-10-04 19:5...
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2017-10-10 21:2...
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2017-10-18, 2017-10-18, ...
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date <dttm> 2017-10-06 11:07:15, 2017-10-06 11:0...
## $ price <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title <chr> NA, NA, NA, "Muito boa a loja", NA, N...
## $ review_comment_message <chr> "Não testei o produto ainda, mas ele ...
## $ review_creation_date <dttm> 2017-10-11, 2017-10-11, 2017-10-11, ...
## $ review_answer_timestamp <dttm> 2017-10-12 03:43:48, 2017-10-12 03:4...
## $ product_category_name <chr> "utilidades_domesticas", "utilidades_...
## $ product_name_lenght <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NA, 5...
## $ product_description_lenght <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NA, 1, 2, 1, ...
## $ product_weight_g <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix <chr> "03149", "03149", "03149", "47813", "...
## $ customer_city <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix <chr> "09350", "09350", "09350", "31570", "...
## $ seller_city <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...