library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(RSQLite)
library(proto)
library(gsubfn)
library(readr)
# Mapas
library(leaflet)
library(knitr)
library(dplyr)
dir()
## [1] "explorar y analizar archivos brazilian_ecoomerce.R"
## [2] "explorar y analizar datos brazilian ecommerce.Rmd"
## [3] "explorar_y_analizar_datos_brazilian_ecommerce.html"
## [4] "explorar_y_analizar_datos_brazilian_ecommerce.Rmd"
## [5] "olist_customers_dataset.csv"
## [6] "olist_geolocation_dataset.csv"
## [7] "olist_order_items_dataset.csv"
## [8] "olist_order_payments_dataset.csv"
## [9] "olist_order_reviews_dataset.csv"
## [10] "olist_orders_dataset.csv"
## [11] "olist_products_dataset.csv"
## [12] "olist_sellers_dataset.csv"
## [13] "product_category_name_translation.csv"
## [14] "rsconnect"
En la siguiente sección, mediante código R, se cargan los datos del archivo olist_customers_dataset.csv, se muestran los primeros 20 registros y se explora la estructrua de los campos del archivo. Al archivo cargado, se identifica en R, con una variable tipo data.frame llamad clientes.
clientes <- read.csv("olist_customers_dataset.csv",
header = TRUE, sep = ",",
stringsAsFactors = TRUE)
clientes[1:20,] # Solo los primeros 20 registros
## customer_id customer_unique_id
## 1 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0
## 2 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3
## 3 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e
## 4 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c
## 5 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066
## 6 879864dab9bc3047522c92c82e1212b8 4c93744516667ad3b8f1fb645a3116a4
## 7 fd826e7cf63160e536e0908c76c3f441 addec96d2e059c80c30fe6871d30d177
## 8 5e274e7a0c3809e14aba7ad5aae0d407 57b2a98a409812fe9618067b6b8ebe4f
## 9 5adf08e34b2e993982a47070956c5c65 1175e95fb47ddff9de6b2b06188f7e0d
## 10 4b7139f34592b3a31687243a302fa75b 9afe194fb833f79e300e37e580171f22
## 11 9fb35e4ed6f0a14a4977cd9aea4042bb 2a7745e1ed516b289ed9b29c7d0539a5
## 12 5aa9e4fdd4dfd20959cad2d772509598 2a46fb94aef5cbeeb850418118cee090
## 13 b2d1536598b73a9abd18e0d75d92f0a3 918dc87cd72cd9f6ed4bd442ed785235
## 14 eabebad39a88bb6f5b52376faec28612 295c05e81917928d76245e842748184d
## 15 1f1c7bf1c9b041b292af6c1c4470b753 3151a81801c8386361b62277d7fa5ecf
## 16 206f3129c0e4d7d0b9550426023f0a08 21f748a16f4e1688a9014eb3ee6fa325
## 17 a7c125a0a07b75146167b7f04a7f8e98 5c2991dbd08bbf3cf410713c4de5a0b5
## 18 c5c61596a3b6bd0cee5766992c48a9a1 b6e99561fe6f34a55b0b7da92f8ed775
## 19 9b8ce803689b3562defaad4613ef426f 7f3a72e8f988c6e735ba118d54f47458
## 20 49d0ea0986edde72da777f15456a0ee0 3e6fd6b2f0d499456a6a6820a40f2d79
## customer_zip_code_prefix customer_city customer_state
## 1 14409 franca SP
## 2 9790 sao bernardo do campo SP
## 3 1151 sao paulo SP
## 4 8775 mogi das cruzes SP
## 5 13056 campinas SP
## 6 89254 jaragua do sul SC
## 7 4534 sao paulo SP
## 8 35182 timoteo MG
## 9 81560 curitiba PR
## 10 30575 belo horizonte MG
## 11 39400 montes claros MG
## 12 20231 rio de janeiro RJ
## 13 18682 lencois paulista SP
## 14 5704 sao paulo SP
## 15 95110 caxias do sul RS
## 16 13412 piracicaba SP
## 17 22750 rio de janeiro RJ
## 18 7124 guarulhos SP
## 19 5416 sao paulo SP
## 20 68485 pacaja PA
clientes$customer_zip_code_prefix <- as.factor(clientes$customer_zip_code_prefix) # Lo convertimos a factor
str(clientes)
## 'data.frame': 99441 obs. of 5 variables:
## $ customer_id : Factor w/ 99441 levels "00012a2ce6f8dcda20d059ce98491703",..: 2611 9562 30461 69606 30708 52562 98443 36424 35167 29245 ...
## $ customer_unique_id : Factor w/ 96096 levels "0000366f3b9a7992bf8c76cfdf3221e2",..: 50397 15434 2273 14193 19734 28806 65370 33034 6612 58226 ...
## $ customer_zip_code_prefix: Factor w/ 14994 levels "1003","1004",..: 4774 3802 68 3586 4307 13965 1870 7478 12809 7110 ...
## $ customer_city : Factor w/ 4119 levels "abadia dos dourados",..: 1383 3429 3598 2344 708 1937 3598 3866 1144 454 ...
## $ customer_state : Factor w/ 27 levels "AC","AL","AM",..: 26 26 26 26 26 24 26 11 18 11 ...
summary(clientes)
## customer_id
## 00012a2ce6f8dcda20d059ce98491703: 1
## 000161a058600d5901f007fab4c27140: 1
## 0001fd6190edaaf884bcaf3d49edf079: 1
## 0002414f95344307404f0ace7a26f1d5: 1
## 000379cdec625522490c315e70c7a9fb: 1
## 0004164d20a9e969af783496f3408652: 1
## (Other) :99435
## customer_unique_id customer_zip_code_prefix
## 8d50f5eadf50201ccdcedfb9e2ac8455: 17 22790 : 142
## 3e43e6105506432c953e165fb2acf44c: 9 24220 : 124
## 1b6c7548a2a1f9037c1fd3ddfed95f33: 7 22793 : 121
## 6469f99c1f9dfae7733b25662e7f1782: 7 24230 : 117
## ca77025e7201e3b30c44b472ff346268: 7 22775 : 110
## 12f5d6e1cbf93dafd9dcc19095df0b3d: 6 29101 : 101
## (Other) :99388 (Other):98726
## customer_city customer_state
## sao paulo :15540 SP :41746
## rio de janeiro: 6882 RJ :12852
## belo horizonte: 2773 MG :11635
## brasilia : 2131 RS : 5466
## curitiba : 1521 PR : 5045
## campinas : 1444 SC : 3637
## (Other) :69150 (Other):19060
En la siguiente sección, mediante código R, se cargan los datos del archivo olist_geolocation_dataset.csv, se muestran los primeros 20 registros y se explora la estructrua de los campos del archivo. Al archivo cargado, se identifica en R, con una variable tipo data.frame llamada geolocaliacion.
geolocalizacion <- read.csv("olist_geolocation_dataset.csv",
header = TRUE, sep = ",",
stringsAsFactors = TRUE)
geolocalizacion[1:20,] # Solo los primeros 20 registros
## geolocation_zip_code_prefix geolocation_lat geolocation_lng
## 1 1037 -23.54562 -46.63929
## 2 1046 -23.54608 -46.64482
## 3 1046 -23.54613 -46.64295
## 4 1041 -23.54439 -46.63950
## 5 1035 -23.54158 -46.64161
## 6 1012 -23.54776 -46.63536
## 7 1047 -23.54627 -46.64123
## 8 1013 -23.54692 -46.63426
## 9 1029 -23.54377 -46.63428
## 10 1011 -23.54764 -46.63603
## 11 1013 -23.54733 -46.63418
## 12 1032 -23.53842 -46.63478
## 13 1014 -23.54644 -46.63383
## 14 1012 -23.54895 -46.63467
## 15 1037 -23.54519 -46.63786
## 16 1046 -23.54608 -46.64482
## 17 1039 -23.54188 -46.63992
## 18 1024 -23.54139 -46.62990
## 19 1009 -23.54694 -46.63659
## 20 1046 -23.54588 -46.64316
## geolocation_city geolocation_state
## 1 sao paulo SP
## 2 sao paulo SP
## 3 sao paulo SP
## 4 sao paulo SP
## 5 sao paulo SP
## 6 são paulo SP
## 7 sao paulo SP
## 8 sao paulo SP
## 9 sao paulo SP
## 10 sao paulo SP
## 11 sao paulo SP
## 12 sao paulo SP
## 13 sao paulo SP
## 14 sao paulo SP
## 15 são paulo SP
## 16 sao paulo SP
## 17 sao paulo SP
## 18 são paulo SP
## 19 sao paulo SP
## 20 sao paulo SP
# Lo convertimos a factor
str(geolocalizacion)
## 'data.frame': 1000163 obs. of 5 variables:
## $ geolocation_zip_code_prefix: int 1037 1046 1046 1041 1035 1012 1047 1013 1029 1011 ...
## $ geolocation_lat : num -23.5 -23.5 -23.5 -23.5 -23.5 ...
## $ geolocation_lng : num -46.6 -46.6 -46.6 -46.6 -46.6 ...
## $ geolocation_city : Factor w/ 8011 levels "* cidade","...arraial do cabo",..: 7029 7029 7029 7029 7029 6249 7029 7029 7029 7029 ...
## $ geolocation_state : Factor w/ 27 levels "AC","AL","AM",..: 26 26 26 26 26 26 26 26 26 26 ...
summary(geolocalizacion)
## geolocation_zip_code_prefix geolocation_lat geolocation_lng
## Min. : 1001 Min. :-36.61 Min. :-101.47
## 1st Qu.:11075 1st Qu.:-23.60 1st Qu.: -48.57
## Median :26530 Median :-22.92 Median : -46.64
## Mean :36574 Mean :-21.18 Mean : -46.39
## 3rd Qu.:63504 3rd Qu.:-19.98 3rd Qu.: -43.77
## Max. :99990 Max. : 45.07 Max. : 121.11
##
## geolocation_city geolocation_state
## sao paulo :135800 SP :404268
## rio de janeiro: 62151 MG :126336
## belo horizonte: 27805 RJ :121169
## são paulo : 24918 RS : 61851
## curitiba : 16593 PR : 57859
## porto alegre : 13521 SC : 38328
## (Other) :719375 (Other):190352
En la siguiente sección se muestra una análisis de datos por medio de un mapa geográfico de las zonas postales de los clientes según su zona postal.
zonaspostales <- as.factor(clientes$customer_zip_code_prefix)
# Una tabla de Freq...
frecuencia <- data.frame(table(zonaspostales))
# Ordenar por la frecuencia y los primeros 10 mas frecuentes
masFrecuentesZP <- head(arrange(frecuencia, desc(Freq)), 10)
# Generar mediante las siguientes lineas de código una
# data frame con los clientes mas frecuentes
datosGeo <- filter(geolocalizacion, geolocation_zip_code_prefix %in% masFrecuentesZP$zonaspostales)
datosGeoUnicos <- sqldf("SELECT distinct(geolocation_zip_code_prefix), geolocation_lat, geolocation_lng, geolocation_city, geolocation_state
FROM datosGeo group by geolocation_zip_code_prefix" )
datosGeoFinal <- data.frame(arrange(masFrecuentesZP, zonaspostales), datosGeoUnicos)
datosGeoFinal <- arrange(datosGeoFinal, desc(Freq))
# Los diez registros con clientes mas frecuentes
datosGeoFinal
## zonaspostales Freq geolocation_zip_code_prefix geolocation_lat
## 1 22790 142 22790 -23.01895
## 2 24220 124 24220 -22.90329
## 3 22793 121 22793 -22.99474
## 4 24230 117 24230 -22.90759
## 5 22775 110 22775 -22.97295
## 6 29101 101 29101 -20.33034
## 7 13212 95 13212 -23.14606
## 8 35162 93 35162 -19.45732
## 9 22631 89 22631 -23.00231
## 10 38400 87 38400 -18.92096
## geolocation_lng geolocation_city geolocation_state
## 1 -43.47470 rio de janeiro RJ
## 2 -43.11363 niteroi RJ
## 3 -43.40600 rio de janeiro RJ
## 4 -43.11005 niteroi RJ
## 5 -43.39239 rio de janeiro RJ
## 6 -40.28343 vila velha ES
## 7 -46.99535 jundiai SP
## 8 -42.56904 ipatinga MG
## 9 -43.35139 rio de janeiro RJ
## 10 -48.29774 uberlândia MG
El mapa se puede agrandar y disminuir para visulizar las leyendas de cada popup
mapa<-leaflet() %>%
addTiles() %>%
addMarkers(data = datosGeoFinal, lng = ~ geolocation_lng, lat = ~ geolocation_lat,
popup = paste(datosGeoFinal$zonaspostales,
datosGeoFinal$geolocation_city,
datosGeoFinal$geolocation_state,
"Clientes:",
datosGeoFinal$Freq, sep = ", ")
)
mapa