:: Libraries

R

if(!require("reticulate")){install.packages("reticulate")}
if(!require("tidyverse")){install.packages("tidyverse")}
if(!require("remotes")){install.packages("remotes")}
if(!require("stringr.plus")){
    remotes::install_github("johncassil/stringr.plus")}

Py

import numpy as np 
import pandas as pd

:: Read

A partir dos arquivos .csv

Py

#the unziped file
df_sellers = pd.read_csv('data/olist_sellers_dataset.csv')

#the ziped files ~ add .zap
df_customers = pd.read_csv('data/olist_customers_dataset.csv.zip')
df_geolocation = pd.read_csv('data/olist_geolocation_dataset.csv.zip')
df_order_items = pd.read_csv('data/olist_order_items_dataset.csv.zip')
df_order_payments = pd.read_csv('data/olist_order_payments_dataset.csv.zip')
df_order_reviews = pd.read_csv('data/olist_order_reviews_dataset.csv.zip')
df_orders = pd.read_csv('data/olist_orders_dataset.csv.zip')
df_products = pd.read_csv('data/olist_products_dataset.csv.zip')

movendo para o ambiente do R

df_sellers <- py$df_sellers
df_customers <- py$df_customers 
df_geolocation <- py$df_geolocation 
df_order_items <- py$df_order_items 
df_order_payments <- py$df_order_payments 
df_order_reviews <- py$df_order_reviews
df_orders <- py$df_orders 
df_products <- py$df_products 

R

df_customers_r <- readr::read_csv('data/olist_customers_dataset.csv.zip')

movendo para o ambiente do Py

r.df_customers_r.head()
##                         customer_id  ... customer_state
## 0  06b8999e2fba1a1fbc88172c00ba8bc7  ...             SP
## 1  18955e83d337fd6b2def6b18a428ac77  ...             SP
## 2  4e7b3e00288586ebd08712fdd0374a03  ...             SP
## 3  b2b6027bc5c5109e529d4dc6358b12c3  ...             SP
## 4  4f2d8ab171c80ec8364f7c12e35b23ad  ...             SP
## 
## [5 rows x 5 columns]

:: Join

df_olist <- df_orders %>% 
    left_join(df_order_items, by = "order_id") %>% 
    full_join(df_order_payments, by = "order_id") %>% 
    full_join(df_order_reviews, by = "order_id") %>% 
    full_join(df_products, by = "product_id") %>% 
    full_join(df_customers, by = "customer_id") %>% 
    full_join(df_sellers, by = "seller_id") %>% 
    glimpse
## Rows: 119,151
## Columns: 39
## $ order_id                      <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id                   <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status                  <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp      <chr> "2017-10-02 10:56:33", "2017-10-02 10...
## $ order_approved_at             <list> ["2017-10-02 11:07:15", "2017-10-02 ...
## $ order_delivered_carrier_date  <list> ["2017-10-04 19:55:00", "2017-10-04 ...
## $ order_delivered_customer_date <list> ["2017-10-10 21:25:13", "2017-10-10 ...
## $ order_estimated_delivery_date <chr> "2017-10-18 00:00:00", "2017-10-18 00...
## $ order_item_id                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id                    <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id                     <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date           <chr> "2017-10-06 11:07:15", "2017-10-06 11...
## $ price                         <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value                 <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential            <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type                  <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments          <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value                 <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id                     <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score                  <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title          <list> [NaN, NaN, NaN, "Muito boa a loja", ...
## $ review_comment_message        <list> ["Não testei o produto ainda, mas el...
## $ review_creation_date          <chr> "2017-10-11 00:00:00", "2017-10-11 00...
## $ review_answer_timestamp       <chr> "2017-10-12 03:43:48", "2017-10-12 03...
## $ product_category_name         <list> ["utilidades_domesticas", "utilidade...
## $ product_name_lenght           <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NaN, ...
## $ product_description_lenght    <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty            <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NaN, 1, 2, 1,...
## $ product_weight_g              <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm             <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm             <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm              <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id            <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix      <dbl> 3149, 3149, 3149, 47813, 75265, 59296...
## $ customer_city                 <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state                <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix        <dbl> 9350, 9350, 9350, 31570, 14840, 31842...
## $ seller_city                   <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state                  <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...

.