library(tidyverse)
## -- Attaching packages ------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
setwd('C:/Users/DellPC/Desktop/Corner/R_source_code/brazilian-ecommerce')

list.files()
##  [1] "back-to-predict-the-future-interactive-m5-eda.Rmd"
##  [2] "brazilian-ecommerce.zip"                          
##  [3] "caret_package.Rmd"                                
##  [4] "geo3.png"                                         
##  [5] "geo4.png"                                         
##  [6] "geo5.png"                                         
##  [7] "geo6.png"                                         
##  [8] "geo7.png"                                         
##  [9] "Olist-in-Makerting-Sense.Rmd"                     
## [10] "Olist in Makerting Sense.Rmd"                     
## [11] "olist.html"                                       
## [12] "olist.Rmd"                                        
## [13] "olist_customers_dataset.csv"                      
## [14] "olist_ecommerce.html"                             
## [15] "olist_ecommerce.Rmd"                              
## [16] "olist_geolocation_dataset.csv"                    
## [17] "olist_order_items_dataset.csv"                    
## [18] "olist_order_payments_dataset.csv"                 
## [19] "olist_order_reviews_dataset.csv"                  
## [20] "olist_orders_dataset.csv"                         
## [21] "olist_products_dataset.csv"                       
## [22] "olist_sellers_dataset.csv"                        
## [23] "product_category_name_translation.csv"            
## [24] "rsconnect"

Read the dataset

cust <- read.csv('olist_customers_dataset.csv')
orders <- read.csv('olist_orders_dataset.csv')
sellers <- read.csv('olist_sellers_dataset.csv')
products <- read.csv('olist_products_dataset.csv')
orderitems <- read.csv('olist_order_items_dataset.csv')
geo <- read.csv('olist_geolocation_dataset.csv')
payments <- read.csv('olist_order_payments_dataset.csv')

nametrans <- read.csv('product_category_name_translation.csv', header = TRUE)

Round the longitude and latitude for 3 number

geo$geolocation_lat <- round(geo$geolocation_lat, 3)
geo$geolocation_lng <- round(geo$geolocation_lng, 3)

Preprocess the longitude and latitude

sell_location <- geo %>% group_by(geolocation_city) %>%
               summarise(sell_lat = max(geolocation_lat),
                         sell_lng = max(geolocation_lng))
## `summarise()` ungrouping output (override with `.groups` argument)
cust_location <- geo %>% group_by(geolocation_city) %>%
               summarise(cust_lat = max(geolocation_lat),
                         cust_lng = max(geolocation_lng))
## `summarise()` ungrouping output (override with `.groups` argument)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
timeframe <- c('order_approved_at', 'order_purchase_timestamp', 'order_delivered_carrier_date',
               'order_delivered_customer_date', 'order_estimated_delivery_date')


orders$order_approved_at <- as.POSIXct(orders$order_approved_at, 
                                       format = '%Y-%m-%d %H:%M:%S')
orders$order_purchase_timestamp <- as.POSIXct(orders$order_purchase_timestamp,
                                              format = '%Y-%m-%d %H:%M:%S')
orders$order_delivered_carrier_date <- as.POSIXct(orders$order_delivered_carrier_date,
                                                  format = '%Y-%m-%d %H:%M:%S')
orders$order_delivered_customer_date <- as.POSIXct(orders$order_delivered_customer_date, 
                                                  format = '%Y-%m-%d %H:%M:%S')
orders$order_estimated_delivery_date <- as.POSIXct(orders$order_estimated_delivery_date,
                                                   format = '%Y-%m-%d %H:%M:%S')


table(orders$order_status)%>% as.data.frame() %>% arrange(desc(Freq))
##          Var1  Freq
## 1   delivered 96478
## 2     shipped  1107
## 3    canceled   625
## 4 unavailable   609
## 5    invoiced   314
## 6  processing   301
## 7     created     5
## 8    approved     2
orderitems$shipping_limit_date <- as.Date(orderitems$shipping_limit_date, 
                                          format = '%Y-%m-%d %H:%M:%S')
MergedData <- full_join(orderitems, sellers, by = 'seller_id')
CustOrd <- full_join(orders, cust, by ='customer_id')
custsellord <- full_join(CustOrd, MergedData, by ='order_id')
custsellordprod <- full_join(custsellord, products, by ='product_id')
complete <- full_join(custsellordprod, payments, by ='order_id')
complete1 <- full_join(complete, sell_location, by = c('seller_city' = 'geolocation_city'))
complete2 <- full_join(complete1, cust_location, by = c('customer_city' = 'geolocation_city')) 

nametrans <- nametrans %>%
               mutate(product_category_name = `ï..product_category_name`)
complete3 <- full_join(complete2, nametrans, by = 'product_category_name')
library(Imap)
## Warning: package 'Imap' was built under R version 4.0.3
## 
## Attaching package: 'Imap'
## The following object is masked from 'package:purrr':
## 
##     imap
dist_list <- list()

for (i in 1:nrow(complete2)) {
               dist_list[[i]] <- gdist(lon.1 = complete2$sell_lng[i],
                                       lat.1 = complete2$sell_lat[i],
                                       lat.2 = complete2$cust_lat[i],
                                       lon.2 = complete2$cust_lng[i],
                                       units = 'miles')
}

head(dist_list)
## [[1]]
## [1] 16.77722
## 
## [[2]]
## [1] 16.77722
## 
## [[3]]
## [1] 16.77722
## 
## [[4]]
## [1] 594.2834
## 
## [[5]]
## [1] 317.5573
## 
## [[6]]
## [1] 1213.525
complete2$distbtwn <- as.integer(dist_list)
NumAttr <- subset(complete2, select = c(distbtwn, price,
                                        freight_value, product_name_lenght, 
                                        product_description_lenght, product_photos_qty,
                                        product_weight_g, payment_value, payment_installments))

library(dplyr)

library(maps)
## Warning: package 'maps' was built under R version 4.0.3
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
Brazil <- map_data('world') %>% filter(
               region == 'Brazil'
)

# Removing some outliers 
# Brazils most Northern spot is at 5 deg 16' 27.8" N latitude.
geo = geo[geo$geolocation_lat <= 5.27438888, ]
#it's most Western spot is at 73 deg, 58' 58.19"W Long
geo = geo[geo$geolocation_lng >= -73.98283055, ]
# It's most southern splot is at 33 deg, 45' 04.21" S latitude.
geo = geo[geo$geolocation_latz >= -33.75116944, ]
#It's most Eastern spot is 34 deg, 47'35.33" W.Long.
geo = geo[geo$geolocation_lng <= -34.79314722, ]
glimpse(geo)
## Rows: 0
## Columns: 5
## $ geolocation_zip_code_prefix <int> 
## $ geolocation_lat             <dbl> 
## $ geolocation_lng             <dbl> 
## $ geolocation_city            <chr> 
## $ geolocation_state           <chr>
complete2 = complete2[complete2$sell_lat <= 5.27438888,]

complete2 = complete2[complete2$cust_lat <= 5.27438888,]

theme_set(theme_light())

ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="black")+
  geom_point(data= complete2,aes(x=sell_lng,y=sell_lat,color=seller_state),size=0.2)
## Warning: Removed 14016 rows containing missing values (geom_point).

ggsave('geo3.png', plot = last_plot())
## Saving 7 x 5 in image
## Warning: Removed 14016 rows containing missing values (geom_point).
ggplot() + geom_polygon(data = Brazil, 
                        aes(x = long, y = lat, group = group), fill = 'black') +
               geom_point(data = complete2, aes(x = cust_lng, y = cust_lat, color = customer_state), size = 0.2)
## Warning: Removed 14016 rows containing missing values (geom_point).

ggsave('geo4.png', plot = last_plot())
## Saving 7 x 5 in image
## Warning: Removed 14016 rows containing missing values (geom_point).
ggplot() + 
               geom_bar(data = complete3, aes(product_category_name_english, fill = seller_state),
                        width = 1) + coord_flip()

ggsave('geo5.png', plot = last_plot())
## Saving 7 x 5 in image
ggplot() + 
               geom_bar(data = complete3, aes(seller_state, fill = seller_state), width = 1) +
               coord_flip()

ggsave('geo6.png', plot = last_plot())
## Saving 7 x 5 in image
library(lubridate)

complete3$order_purchase_timestamp = ymd_hms(complete3$order_purchase_timestamp)
complete3$order_delivered_carrier_date = ymd_hms(complete3$order_delivered_carrier_date)
complete3$order_delivered_customer_date = ymd_hms(complete3$order_delivered_customer_date)
complete3$order_estimated_delivery_date = ymd_hms(complete3$order_estimated_delivery_date)
## Warning: All formats failed to parse. No formats found.
complete3$order_approved_at = ymd_hms(complete3$order_approved_at)

complete3$shipping_limit_date = ymd_hms(complete3$shipping_limit_date)
## Warning: All formats failed to parse. No formats found.
complete3 %>% group_by(seller_state, customer_state) %>%
               summarise(count = n()) %>%
               arrange(desc(count))
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 442 x 3
## # Groups:   seller_state [24]
##    seller_state customer_state count
##    <chr>        <chr>          <int>
##  1 SP           SP             37736
##  2 <NA>         <NA>           11407
##  3 SP           RJ             10231
##  4 SP           MG              9059
##  5 SP           RS              4343
##  6 SP           PR              3825
##  7 PR           SP              3577
##  8 MG           SP              3111
##  9 SP           BA              2885
## 10 SP           SC              2833
## # ... with 432 more rows
complete3 %>% group_by(product_category_name_english) %>%
               summarise(count = n()) %>% arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 72 x 2
##    product_category_name_english count
##    <chr>                         <int>
##  1 <NA>                          13960
##  2 bed_bath_table                11823
##  3 health_beauty                  9975
##  4 sports_leisure                 8945
##  5 furniture_decor                8744
##  6 computers_accessories          8082
##  7 housewares                     7355
##  8 watches_gifts                  6201
##  9 telephony                      4721
## 10 garden_tools                   4574
## # ... with 62 more rows
complete3 %>% 
               filter(seller_state != 'SP') %>%
               filter(product_category_name_english == 'computers') %>%
               group_by(seller_state, seller_city) %>%
               summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 5 x 3
## # Groups:   seller_state [3]
##   seller_state seller_city      count
##   <chr>        <chr>            <int>
## 1 BA           guanambi             1
## 2 BA           lauro de freitas   170
## 3 MG           belo horizonte       1
## 4 MG           montes claros        1
## 5 PR           londrina             1
complete3 %>% 
               filter(seller_state != 'SP') %>%
               filter(product_category_name_english == 'arts_and_craftmanship') %>%
               group_by(seller_state, seller_city) %>%
               summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 4 x 3
## # Groups:   seller_state [4]
##   seller_state seller_city count
##   <chr>        <chr>       <int>
## 1 MG           sabara          1
## 2 RJ           nova iguacu     1
## 3 RS           lajeado         1
## 4 SC           criciuma        1
complete3 %>% 
               filter(seller_state != 'SP') %>%
               filter(product_category_name_english == 'flowers') %>%
               group_by(seller_state, seller_city) %>%
               summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 1 x 3
## # Groups:   seller_state [1]
##   seller_state seller_city  count
##   <chr>        <chr>        <int>
## 1 RJ           belford roxo     1
complete3 %>% 
               filter(seller_state != 'SP') %>%
               filter(product_category_name_english == 'fashion_sport') %>%
               group_by(seller_state, seller_city) %>%
               summarise(count = n())
## `summarise()` regrouping output by 'seller_state' (override with `.groups` argument)
## # A tibble: 3 x 3
## # Groups:   seller_state [3]
##   seller_state seller_city   count
##   <chr>        <chr>         <int>
## 1 MG           tres coracoes     1
## 2 PR           curitiba          1
## 3 SC           timbo             2
complete3 %>%
               group_by(customer_unique_id) %>%
               summarise(count = n()) %>%
               arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 96,097 x 2
##    customer_unique_id               count
##    <chr>                            <int>
##  1 <NA>                             11407
##  2 9a736b248f67d166d2fbb006bcb877c3    75
##  3 6fbc7cdadbb522125f4b27ae9dee4060    38
##  4 f9ae226291893fda10af7965268fb7f6    35
##  5 8af7ac63b2efbcbd88e5b11505e8098a    29
##  6 569aa12b73b5f7edeaa6f2a01603e381    26
##  7 5419a7c9b86a43d8140e2939cd2c2f7e    24
##  8 85963fd37bfd387aa6d915d8a1065486    24
##  9 c8460e4251689ba205045f3ea17884a1    24
## 10 db1af3fd6b23ac3873ef02619d548f9c    24
## # ... with 96,087 more rows
NumAttr <- na.omit(NumAttr)
#cluster analysis 

set.seed(123)
NumAttr$distbtwn <- as.integer(NumAttr$distbtwn)
scaledattr <- scale(NumAttr)
fit <- kmeans(scaledattr, 5)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fit$withinss
## [1]  98349.17 101719.73 176470.02 103206.66 179395.04
fviz_cluster(fit, NumAttr)

wss <- 0

for (i in 1:10) {
               wss[i] <- sum(kmeans(NumAttr, centers =i)$withinss)

}

plot(1:10, wss, type = 'b', 
     xlab = 'Number of Clusters',
     ylab = 'Within groups sum of squares')

ggsave('geo7.png', plot = last_plot())
## Saving 7 x 5 in image
fit <- kmeans(scaledattr, 8)