library(tidyverse)
## -- Attaching packages ------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
setwd('C:/Users/DellPC/Desktop/Corner/R_source_code/brazilian-ecommerce')
list.files()
## [1] "back-to-predict-the-future-interactive-m5-eda.Rmd"
## [2] "brazilian-ecommerce.zip"
## [3] "caret_package.Rmd"
## [4] "geo3.png"
## [5] "geo4.png"
## [6] "geo5.png"
## [7] "geo6.png"
## [8] "geo7.png"
## [9] "Olist-in-Makerting-Sense.Rmd"
## [10] "Olist in Makerting Sense.Rmd"
## [11] "olist.html"
## [12] "olist.Rmd"
## [13] "olist_customers_dataset.csv"
## [14] "olist_ecommerce.html"
## [15] "olist_ecommerce.Rmd"
## [16] "olist_geolocation_dataset.csv"
## [17] "olist_order_items_dataset.csv"
## [18] "olist_order_payments_dataset.csv"
## [19] "olist_order_reviews_dataset.csv"
## [20] "olist_orders_dataset.csv"
## [21] "olist_products_dataset.csv"
## [22] "olist_sellers_dataset.csv"
## [23] "product_category_name_translation.csv"
## [24] "rsconnect"
Read the dataset
cust <- read.csv('olist_customers_dataset.csv')
orders <- read.csv('olist_orders_dataset.csv')
sellers <- read.csv('olist_sellers_dataset.csv')
products <- read.csv('olist_products_dataset.csv')
orderitems <- read.csv('olist_order_items_dataset.csv')
geo <- read.csv('olist_geolocation_dataset.csv')
payments <- read.csv('olist_order_payments_dataset.csv')
nametrans <- read.csv('product_category_name_translation.csv', header = TRUE)
Round the longitude and latitude for 3 number
geo$geolocation_lat <- round(geo$geolocation_lat, 3)
geo$geolocation_lng <- round(geo$geolocation_lng, 3)
Preprocess the longitude and latitude
sell_location <- geo %>% group_by(geolocation_city) %>%
summarise(sell_lat = max(geolocation_lat),
sell_lng = max(geolocation_lng))
## `summarise()` ungrouping output (override with `.groups` argument)
cust_location <- geo %>% group_by(geolocation_city) %>%
summarise(cust_lat = max(geolocation_lat),
cust_lng = max(geolocation_lng))
## `summarise()` ungrouping output (override with `.groups` argument)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
timeframe <- c('order_approved_at', 'order_purchase_timestamp', 'order_delivered_carrier_date',
'order_delivered_customer_date', 'order_estimated_delivery_date')
orders$order_approved_at <- as.POSIXct(orders$order_approved_at,
format = '%Y-%m-%d %H:%M:%S')
orders$order_purchase_timestamp <- as.POSIXct(orders$order_purchase_timestamp,
format = '%Y-%m-%d %H:%M:%S')
orders$order_delivered_carrier_date <- as.POSIXct(orders$order_delivered_carrier_date,
format = '%Y-%m-%d %H:%M:%S')
orders$order_delivered_customer_date <- as.POSIXct(orders$order_delivered_customer_date,
format = '%Y-%m-%d %H:%M:%S')
orders$order_estimated_delivery_date <- as.POSIXct(orders$order_estimated_delivery_date,
format = '%Y-%m-%d %H:%M:%S')
table(orders$order_status)%>% as.data.frame() %>% arrange(desc(Freq))
## Var1 Freq
## 1 delivered 96478
## 2 shipped 1107
## 3 canceled 625
## 4 unavailable 609
## 5 invoiced 314
## 6 processing 301
## 7 created 5
## 8 approved 2
orderitems$shipping_limit_date <- as.Date(orderitems$shipping_limit_date,
format = '%Y-%m-%d %H:%M:%S')
MergedData <- full_join(orderitems, sellers, by = 'seller_id')
CustOrd <- full_join(orders, cust, by ='customer_id')
custsellord <- full_join(CustOrd, MergedData, by ='order_id')
custsellordprod <- full_join(custsellord, products, by ='product_id')
complete <- full_join(custsellordprod, payments, by ='order_id')
complete1 <- full_join(complete, sell_location, by = c('seller_city' = 'geolocation_city'))
complete2 <- full_join(complete1, cust_location, by = c('customer_city' = 'geolocation_city'))
nametrans <- nametrans %>%
mutate(product_category_name = `ï..product_category_name`)
complete3 <- full_join(complete2, nametrans, by = 'product_category_name')
library(Imap)
## Warning: package 'Imap' was built under R version 4.0.3
##
## Attaching package: 'Imap'
## The following object is masked from 'package:purrr':
##
## imap
dist_list <- list()
for (i in 1:nrow(complete2)) {
dist_list[[i]] <- gdist(lon.1 = complete2$sell_lng[i],
lat.1 = complete2$sell_lat[i],
lat.2 = complete2$cust_lat[i],
lon.2 = complete2$cust_lng[i],
units = 'miles')
}
head(dist_list)
## [[1]]
## [1] 16.77722
##
## [[2]]
## [1] 16.77722
##
## [[3]]
## [1] 16.77722
##
## [[4]]
## [1] 594.2834
##
## [[5]]
## [1] 317.5573
##
## [[6]]
## [1] 1213.525
complete2$distbtwn <- as.integer(dist_list)
NumAttr <- subset(complete2, select = c(distbtwn, price,
freight_value, product_name_lenght,
product_description_lenght, product_photos_qty,
product_weight_g, payment_value, payment_installments))
library(dplyr)
library(maps)
## Warning: package 'maps' was built under R version 4.0.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
Brazil <- map_data('world') %>% filter(
region == 'Brazil'
)
# Removing some outliers
# Brazils most Northern spot is at 5 deg 16' 27.8" N latitude.
geo = geo[geo$geolocation_lat <= 5.27438888, ]
#it's most Western spot is at 73 deg, 58' 58.19"W Long
geo = geo[geo$geolocation_lng >= -73.98283055, ]
# It's most southern splot is at 33 deg, 45' 04.21" S latitude.
geo = geo[geo$geolocation_latz >= -33.75116944, ]
#It's most Eastern spot is 34 deg, 47'35.33" W.Long.
geo = geo[geo$geolocation_lng <= -34.79314722, ]
glimpse(geo)
## Rows: 0
## Columns: 5
## $ geolocation_zip_code_prefix <int>
## $ geolocation_lat <dbl>
## $ geolocation_lng <dbl>
## $ geolocation_city <chr>
## $ geolocation_state <chr>
complete2 = complete2[complete2$sell_lat <= 5.27438888,]
complete2 = complete2[complete2$cust_lat <= 5.27438888,]
theme_set(theme_light())
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="black")+
geom_point(data= complete2,aes(x=sell_lng,y=sell_lat,color=seller_state),size=0.2)
## Warning: Removed 14016 rows containing missing values (geom_point).
ggsave('geo3.png', plot = last_plot())
## Saving 7 x 5 in image
## Warning: Removed 14016 rows containing missing values (geom_point).
ggplot() + geom_polygon(data = Brazil,
aes(x = long, y = lat, group = group), fill = 'black') +
geom_point(data = complete2, aes(x = cust_lng, y = cust_lat, color = customer_state), size = 0.2)
## Warning: Removed 14016 rows containing missing values (geom_point).
ggsave('geo4.png', plot = last_plot())
## Saving 7 x 5 in image
## Warning: Removed 14016 rows containing missing values (geom_point).
ggplot() +
geom_bar(data = complete3, aes(product_category_name_english, fill = seller_state),
width = 1) + coord_flip()
ggsave('geo5.png', plot = last_plot())
## Saving 7 x 5 in image
ggplot() +
geom_bar(data = complete3, aes(seller_state, fill = seller_state), width = 1) +
coord_flip()
ggsave('geo6.png', plot = last_plot())
## Saving 7 x 5 in image
library(lubridate)
complete3$order_purchase_timestamp = ymd_hms(complete3$order_purchase_timestamp)
complete3$order_delivered_carrier_date = ymd_hms(complete3$order_delivered_carrier_date)
complete3$order_delivered_customer_date = ymd_hms(complete3$order_delivered_customer_date)
complete3$order_estimated_delivery_date = ymd_hms(complete3$order_estimated_delivery_date)
## Warning: All formats failed to parse. No formats found.
complete3$order_approved_at = ymd_hms(complete3$order_approved_at)
complete3$shipping_limit_date = ymd_hms(complete3$shipping_limit_date)
## Warning: All formats failed to parse. No formats found.
complete3 %>% group_by(seller_state, customer_state) %>%
summarise(count = n()) %>%
arrange(desc(count))
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 442 x 3
## # Groups: seller_state [24]
## seller_state customer_state count
## <chr> <chr> <int>
## 1 SP SP 37736
## 2 <NA> <NA> 11407
## 3 SP RJ 10231
## 4 SP MG 9059
## 5 SP RS 4343
## 6 SP PR 3825
## 7 PR SP 3577
## 8 MG SP 3111
## 9 SP BA 2885
## 10 SP SC 2833
## # ... with 432 more rows
complete3 %>% group_by(product_category_name_english) %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 72 x 2
## product_category_name_english count
## <chr> <int>
## 1 <NA> 13960
## 2 bed_bath_table 11823
## 3 health_beauty 9975
## 4 sports_leisure 8945
## 5 furniture_decor 8744
## 6 computers_accessories 8082
## 7 housewares 7355
## 8 watches_gifts 6201
## 9 telephony 4721
## 10 garden_tools 4574
## # ... with 62 more rows
complete3 %>%
filter(seller_state != 'SP') %>%
filter(product_category_name_english == 'computers') %>%
group_by(seller_state, seller_city) %>%
summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 5 x 3
## # Groups: seller_state [3]
## seller_state seller_city count
## <chr> <chr> <int>
## 1 BA guanambi 1
## 2 BA lauro de freitas 170
## 3 MG belo horizonte 1
## 4 MG montes claros 1
## 5 PR londrina 1
complete3 %>%
filter(seller_state != 'SP') %>%
filter(product_category_name_english == 'arts_and_craftmanship') %>%
group_by(seller_state, seller_city) %>%
summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 4 x 3
## # Groups: seller_state [4]
## seller_state seller_city count
## <chr> <chr> <int>
## 1 MG sabara 1
## 2 RJ nova iguacu 1
## 3 RS lajeado 1
## 4 SC criciuma 1
complete3 %>%
filter(seller_state != 'SP') %>%
filter(product_category_name_english == 'flowers') %>%
group_by(seller_state, seller_city) %>%
summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 1 x 3
## # Groups: seller_state [1]
## seller_state seller_city count
## <chr> <chr> <int>
## 1 RJ belford roxo 1
complete3 %>%
filter(seller_state != 'SP') %>%
filter(product_category_name_english == 'fashion_sport') %>%
group_by(seller_state, seller_city) %>%
summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 3 x 3
## # Groups: seller_state [3]
## seller_state seller_city count
## <chr> <chr> <int>
## 1 MG tres coracoes 1
## 2 PR curitiba 1
## 3 SC timbo 2
complete3 %>%
group_by(customer_unique_id) %>%
summarise(count = n()) %>%
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 96,097 x 2
## customer_unique_id count
## <chr> <int>
## 1 <NA> 11407
## 2 9a736b248f67d166d2fbb006bcb877c3 75
## 3 6fbc7cdadbb522125f4b27ae9dee4060 38
## 4 f9ae226291893fda10af7965268fb7f6 35
## 5 8af7ac63b2efbcbd88e5b11505e8098a 29
## 6 569aa12b73b5f7edeaa6f2a01603e381 26
## 7 5419a7c9b86a43d8140e2939cd2c2f7e 24
## 8 85963fd37bfd387aa6d915d8a1065486 24
## 9 c8460e4251689ba205045f3ea17884a1 24
## 10 db1af3fd6b23ac3873ef02619d548f9c 24
## # ... with 96,087 more rows
NumAttr <- na.omit(NumAttr)
#cluster analysis
set.seed(123)
NumAttr$distbtwn <- as.integer(NumAttr$distbtwn)
scaledattr <- scale(NumAttr)
fit <- kmeans(scaledattr, 5)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fit$withinss
## [1] 98349.17 101719.73 176470.02 103206.66 179395.04
fviz_cluster(fit, NumAttr)
wss <- 0
for (i in 1:10) {
wss[i] <- sum(kmeans(NumAttr, centers =i)$withinss)
}
plot(1:10, wss, type = 'b',
xlab = 'Number of Clusters',
ylab = 'Within groups sum of squares')
ggsave('geo7.png', plot = last_plot())
## Saving 7 x 5 in image
fit <- kmeans(scaledattr, 8)