if(!require("reticulate")){install.packages("reticulate")}
if(!require("tidyverse")){install.packages("tidyverse")}
if(!require("remotes")){install.packages("remotes")}
if(!require("stringr.plus")){
remotes::install_github("johncassil/stringr.plus")}
import numpy as np
import pandas as pd
A partir dos arquivos .csv
#the unziped file
df_sellers = pd.read_csv('data/olist_sellers_dataset.csv')
#the ziped files ~ add .zap
df_customers = pd.read_csv('data/olist_customers_dataset.csv.zip')
df_geolocation = pd.read_csv('data/olist_geolocation_dataset.csv.zip')
df_order_items = pd.read_csv('data/olist_order_items_dataset.csv.zip')
df_order_payments = pd.read_csv('data/olist_order_payments_dataset.csv.zip')
df_order_reviews = pd.read_csv('data/olist_order_reviews_dataset.csv.zip')
df_orders = pd.read_csv('data/olist_orders_dataset.csv.zip')
df_products = pd.read_csv('data/olist_products_dataset.csv.zip')
df_sellers <- py$df_sellers
df_customers <- py$df_customers
df_geolocation <- py$df_geolocation
df_order_items <- py$df_order_items
df_order_payments <- py$df_order_payments
df_order_reviews <- py$df_order_reviews
df_orders <- py$df_orders
df_products <- py$df_products
df_customers_r <- readr::read_csv('data/olist_customers_dataset.csv.zip')
r.df_customers_r.head()
## customer_id ... customer_state
## 0 06b8999e2fba1a1fbc88172c00ba8bc7 ... SP
## 1 18955e83d337fd6b2def6b18a428ac77 ... SP
## 2 4e7b3e00288586ebd08712fdd0374a03 ... SP
## 3 b2b6027bc5c5109e529d4dc6358b12c3 ... SP
## 4 4f2d8ab171c80ec8364f7c12e35b23ad ... SP
##
## [5 rows x 5 columns]
df_olist <- df_orders %>%
left_join(df_order_items, by = "order_id") %>%
full_join(df_order_payments, by = "order_id") %>%
full_join(df_order_reviews, by = "order_id") %>%
full_join(df_products, by = "product_id") %>%
full_join(df_customers, by = "customer_id") %>%
full_join(df_sellers, by = "seller_id") %>%
glimpse
## Rows: 119,151
## Columns: 39
## $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp <chr> "2017-10-02 10:56:33", "2017-10-02 10...
## $ order_approved_at <list> ["2017-10-02 11:07:15", "2017-10-02 ...
## $ order_delivered_carrier_date <list> ["2017-10-04 19:55:00", "2017-10-04 ...
## $ order_delivered_customer_date <list> ["2017-10-10 21:25:13", "2017-10-10 ...
## $ order_estimated_delivery_date <chr> "2017-10-18 00:00:00", "2017-10-18 00...
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date <chr> "2017-10-06 11:07:15", "2017-10-06 11...
## $ price <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title <list> [NaN, NaN, NaN, "Muito boa a loja", ...
## $ review_comment_message <list> ["Não testei o produto ainda, mas el...
## $ review_creation_date <chr> "2017-10-11 00:00:00", "2017-10-11 00...
## $ review_answer_timestamp <chr> "2017-10-12 03:43:48", "2017-10-12 03...
## $ product_category_name <list> ["utilidades_domesticas", "utilidade...
## $ product_name_lenght <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NaN, ...
## $ product_description_lenght <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NaN, 1, 2, 1,...
## $ product_weight_g <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix <dbl> 3149, 3149, 3149, 47813, 75265, 59296...
## $ customer_city <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix <dbl> 9350, 9350, 9350, 31570, 14840, 31842...
## $ seller_city <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...