Este script muestra un análisis de los Productos, Categorías y Vendedores del conjunto de datos brazilian-ecommerce.

Se exploran las estructuras de los siguientes archivos:

“olist_products_dataset.csv”,

“product_category_name_translation.csv” y

“olist_sellers_dataset.csv”.

Se cargan las librerías necesarias:

  library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
  library(RSQLite)
  library(proto)
  library(gsubfn)
  library(readr)

Explorar productos por medio de la carga del archivo “olist_products_dataset.csv”

# Load the product catalogue from the working directory. header = TRUE and
# sep = "," are already the read.csv() defaults, so only the conversion of
# text columns to factors has to be requested explicitly.
productos <- read.csv(
  file = "olist_products_dataset.csv",
  stringsAsFactors = TRUE
)

# Preview the first rows of the table.
head(productos)
##                         product_id product_category_name
## 1 1e9e8ef04dbcff4541ed26657ea517e5            perfumaria
## 2 3aa071139cb16b67ca9e5dea641aaa2f                 artes
## 3 96bd76ec8810374ed1b65e291975717f         esporte_lazer
## 4 cef67bcfe19066a932b7673e239eb23d                 bebes
## 5 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas
## 6 41d3672d4792049fa1779bb35283ed13 instrumentos_musicais
##   product_name_lenght product_description_lenght product_photos_qty
## 1                  40                        287                  1
## 2                  44                        276                  1
## 3                  46                        250                  1
## 4                  27                        261                  1
## 5                  37                        402                  4
## 6                  60                        745                  1
##   product_weight_g product_length_cm product_height_cm product_width_cm
## 1              225                16                10               14
## 2             1000                30                18               20
## 3              154                18                 9               15
## 4              371                26                 4               26
## 5              625                20                17               13
## 6              200                38                 5               11
str(productos) # Structure: type and first values of every column
## 'data.frame':    32951 obs. of  9 variables:
##  $ product_id                : Factor w/ 32951 levels "00066f42aeeb9f3007548bb9d3f33c38",..: 4054 7646 19352 26673 20306 8572 14931 4942 7250 18075 ...
##  $ product_category_name     : Factor w/ 74 levels "","agro_industria_e_comercio",..: 64 5 34 11 74 47 28 56 30 14 ...
##  $ product_name_lenght       : int  40 44 46 27 37 60 56 56 57 36 ...
##  $ product_description_lenght: int  287 276 250 261 402 745 1272 184 163 1156 ...
##  $ product_photos_qty        : int  1 1 1 1 4 1 4 2 1 1 ...
##  $ product_weight_g          : int  225 1000 154 371 625 200 18350 900 400 600 ...
##  $ product_length_cm         : int  16 30 18 26 20 38 70 40 27 17 ...
##  $ product_height_cm         : int  10 18 9 4 17 5 24 8 13 10 ...
##  $ product_width_cm          : int  14 20 15 26 13 11 44 40 17 12 ...
# Per-column summary: level counts for the factor columns; quartiles, mean
# and NA counts for the numeric ones.
summary(productos)
##                             product_id              product_category_name
##  00066f42aeeb9f3007548bb9d3f33c38:    1   cama_mesa_banho      : 3029    
##  00088930e925c41fd95ebfe695fd2655:    1   esporte_lazer        : 2867    
##  0009406fd7479715e4bef61dd91f2462:    1   moveis_decoracao     : 2657    
##  000b8f95fcb9e0096488278317764d19:    1   beleza_saude         : 2444    
##  000d9be29b5207b54e86aa1b1ac54872:    1   utilidades_domesticas: 2335    
##  0011c512eb256aa0dbbb544d8dffcf6e:    1   automotivo           : 1900    
##  (Other)                         :32945   (Other)              :17719    
##  product_name_lenght product_description_lenght product_photos_qty
##  Min.   : 5.00       Min.   :   4.0             Min.   : 1.000    
##  1st Qu.:42.00       1st Qu.: 339.0             1st Qu.: 1.000    
##  Median :51.00       Median : 595.0             Median : 1.000    
##  Mean   :48.48       Mean   : 771.5             Mean   : 2.189    
##  3rd Qu.:57.00       3rd Qu.: 972.0             3rd Qu.: 3.000    
##  Max.   :76.00       Max.   :3992.0             Max.   :20.000    
##  NA's   :610         NA's   :610                NA's   :610       
##  product_weight_g product_length_cm product_height_cm product_width_cm
##  Min.   :    0    Min.   :  7.00    Min.   :  2.00    Min.   :  6.0   
##  1st Qu.:  300    1st Qu.: 18.00    1st Qu.:  8.00    1st Qu.: 15.0   
##  Median :  700    Median : 25.00    Median : 13.00    Median : 20.0   
##  Mean   : 2276    Mean   : 30.82    Mean   : 16.94    Mean   : 23.2   
##  3rd Qu.: 1900    3rd Qu.: 38.00    3rd Qu.: 21.00    3rd Qu.: 30.0   
##  Max.   :40425    Max.   :105.00    Max.   :105.00    Max.   :118.0   
##  NA's   :2        NA's   :2         NA's   :2         NA's   :2
# Frequency table of products per category, sorted from the most to the
# least common category. Blank category names form their own "" level of
# the factor and are counted like any other category.
frecuencias <- data.frame(table(productos$product_category_name))
# The first column holds product categories, so it is labelled "Categoria"
# (it was previously mislabelled "Estado").
colnames(frecuencias) <- c("Categoria", "Frecuencia")
# Share of the whole catalogue, in percent.
frecuencias$FrecPorcentual <-
  frecuencias$Frecuencia / sum(frecuencias$Frecuencia) * 100
frecuencias <- arrange(frecuencias, desc(Frecuencia))
head(frecuencias)
##               Categoria Frecuencia FrecPorcentual
## 1       cama_mesa_banho       3029       9.192437
## 2         esporte_lazer       2867       8.700798
## 3      moveis_decoracao       2657       8.063488
## 4          beleza_saude       2444       7.417074
## 5 utilidades_domesticas       2335       7.086280
## 6            automotivo       1900       5.766138
tail(frecuencias)
##                        Categoria Frecuencia FrecPorcentual
## 69      tablets_impressao_imagem          9    0.027313283
## 70               casa_conforto_2          5    0.015174046
## 71 fashion_roupa_infanto_juvenil          5    0.015174046
## 72                      pc_gamer          3    0.009104428
## 73            seguros_e_servicos          2    0.006069619
## 74             cds_dvds_musicais          1    0.003034809

Explorar categorías por medio de la carga del archivo “product_category_name_translation.csv”

# Load the Portuguese -> English category-name lookup table.
# The file begins with a UTF-8 byte-order mark. Without
# fileEncoding = "UTF-8-BOM" those bytes are read as data and the first
# column ends up named `ï..product_category_name`, which no longer matches
# the `product_category_name` column of `productos`.
categorias <- read.csv("product_category_name_translation.csv",
                       header = TRUE, sep = ",",
                       stringsAsFactors = TRUE,
                       fileEncoding = "UTF-8-BOM")

head(categorias) # First rows
##    product_category_name product_category_name_english
## 1           beleza_saude                 health_beauty
## 2 informatica_acessorios         computers_accessories
## 3             automotivo                          auto
## 4        cama_mesa_banho                bed_bath_table
## 5       moveis_decoracao               furniture_decor
## 6          esporte_lazer                sports_leisure
str(categorias) # Structure: type and first values of every column
## 'data.frame':    71 obs. of  2 variables:
##  $ product_category_name        : Factor w/ 71 levels "agro_industria_e_comercio",..: 12 45 9 14 55 33 62 71 69 65 ...
##  $ product_category_name_english: Factor w/ 71 levels "agro_industry_and_commerce",..: 44 16 6 8 40 66 60 50 69 71 ...
# Every level occurs exactly once: one row per category.
summary(categorias)
##                product_category_name
##  agro_industria_e_comercio: 1       
##  alimentos                : 1       
##  alimentos_bebidas        : 1       
##  artes                    : 1       
##  artes_e_artesanato       : 1       
##  artigos_de_festas        : 1       
##  (Other)                  :65       
##             product_category_name_english
##  agro_industry_and_commerce: 1           
##  air_conditioning          : 1           
##  art                       : 1           
##  arts_and_craftmanship     : 1           
##  audio                     : 1           
##  auto                      : 1           
##  (Other)                   :65

Explorar vendedores por medio de la carga del archivo “olist_sellers_dataset.csv”

# Load the seller master data from the working directory; text columns are
# converted to factors. header = TRUE and sep = "," are the read.csv()
# defaults and therefore not repeated.
# NOTE(review): seller_zip_code_prefix is parsed as an integer, so a prefix
# written with a leading zero would lose it (row 4 prints as 4195) — confirm
# whether it should be read as text instead.
vendedores <- read.csv(
  file = "olist_sellers_dataset.csv",
  stringsAsFactors = TRUE
)

# Preview the first rows of the table.
head(vendedores)
##                          seller_id seller_zip_code_prefix
## 1 3442f8959a84dea7ee197c632cb2df15                  13023
## 2 d1b65fc7debc3361ea86b5f14c68d2e2                  13844
## 3 ce3ad9de960102d0677a81f5d0bb7b2d                  20031
## 4 c0f3eea2e14555b6faeea3dd58c1b1c3                   4195
## 5 51a04a8a6bdcb23deccc82b0b80742cf                  12914
## 6 c240c4061717ac1806ae6ee72be3533b                  20920
##         seller_city seller_state
## 1          campinas           SP
## 2        mogi guacu           SP
## 3    rio de janeiro           RJ
## 4         sao paulo           SP
## 5 braganca paulista           SP
## 6    rio de janeiro           RJ
str(vendedores) # Structure: type and first values of every column
## 'data.frame':    3095 obs. of  4 variables:
##  $ seller_id             : Factor w/ 3095 levels "0015a82c2db000af6aaaf3ae2ecb0532",..: 623 2541 2506 2326 982 2343 2749 322 1458 2490 ...
##  $ seller_zip_code_prefix: int  13023 13844 20031 4195 12914 20920 55325 16304 1529 80310 ...
##  $ seller_city           : Factor w/ 611 levels "04482255","abadia de goias",..: 102 343 451 519 81 451 84 403 519 160 ...
##  $ seller_state          : Factor w/ 23 levels "AC","AM","BA",..: 23 23 17 23 23 17 14 23 23 16 ...
# Per-column summary: most frequent levels for the factor columns, quartiles
# and mean for the integer zip-code prefix.
summary(vendedores)
##                             seller_id    seller_zip_code_prefix
##  0015a82c2db000af6aaaf3ae2ecb0532:   1   Min.   : 1001         
##  001cca7ae9ae17fb1caed9dfb1094831:   1   1st Qu.: 7094         
##  001e6ad469a905060d959994f1b41e4f:   1   Median :14940         
##  002100f778ceb8431b7a1020ff7ab48f:   1   Mean   :32291         
##  003554e2dce176b5555353e4f3555ac8:   1   3rd Qu.:64553         
##  004c9cd9d87a3c30c522c48c4fc07416:   1   Max.   :99730         
##  (Other)                         :3089                         
##          seller_city    seller_state 
##  sao paulo     : 694   SP     :1849  
##  curitiba      : 127   PR     : 349  
##  rio de janeiro:  96   MG     : 244  
##  belo horizonte:  68   SC     : 190  
##  ribeirao preto:  52   RJ     : 171  
##  guarulhos     :  50   RS     : 129  
##  (Other)       :2008   (Other): 163