1. Introduction

In this project we will study the Brazillian E-commerce data.After that we will perform some data preprocessing , Exploratory Data Analysis, and Predictive Analyses Using Machine Learning Models .

2. Data Curation

2.1 Data description

  • Customers Data: This dataset has information about the customer and its location. Each order is assigned to a unique customerid. This means that the same customer will get different ids for different orders. The purpose of having a customerunique_id on the dataset is to allow you to identify customers that made repurchases at the store.

  • Geolocation Data: This dataset has information Brazilian zip codes and its lat/lng coordinates. Use it to plot maps and find distances between sellers and customers.

  • Order Items Dataset: This dataset includes data about the items purchased within each order

  • Order Paymets Dataset: This dataset includes data about the orders payment options.

  • Order Reviews Dataset: This dataset includes data about the reviews made by the customers.

  • Order Dataset: This is the core dataset. From each order you might find all other information.

  • Product Dataset: This dataset includes data about the products sold by Olist.

  • Sellers Dataset: This dataset includes data about the sellers that fulfilled orders made at Olist.

  • Use it to find the seller location and to identify which seller fulfilled each product.

  • Product Category Name translation: Translates the productcategoryname to english.

2.2 Data Source

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(knitr)
library(lattice)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.4
## ✔ tidyr   1.1.4     ✔ stringr 1.5.0
## ✔ readr   2.1.1     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ lubridate::union()       masks base::union()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: '/opt/X11/lib/libSM.6.dylib'
##   Referenced from: '/Library/Frameworks/R.framework/Versions/4.1/Resources/modules/R_X11.so'
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home/jre/lib/server/libSM.6.dylib' (no such file)
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(ggrepel)
library(geodata)
## Loading required package: terra
## terra 1.5.21
## 
## Attaching package: 'terra'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:knitr':
## 
##     spin
## The following object is masked from 'package:ggplot2':
## 
##     arrow
## The following object is masked from 'package:dplyr':
## 
##     src
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
# read data
review<-read.csv("olist_order_reviews_dataset.csv", header = TRUE)
Orders<-read.csv("olist_orders_dataset.csv", header = TRUE)
products<-read.csv("olist_products_dataset.csv", header = TRUE)
sellers<-read.csv("olist_sellers_dataset.csv", header = TRUE)
customer<-read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation<-read.csv("olist_geolocation_dataset.csv", header = TRUE)
payments<-read.csv("olist_order_payments_dataset.csv", header = TRUE)
Items<-read.csv("olist_order_items_dataset.csv", header = TRUE)
nametrans<-read.csv("product_category_name_translation.csv", header = TRUE)

3. Data preprocessing

3.1 Join Operation

c<-left_join(customer, Orders, by='customer_id')
x<-left_join(Items, products, by='product_id')
s<-left_join(x, sellers, by='seller_id')
k<-left_join(c, s, by='order_id')
N<-left_join(k, payments, by='order_id')
df<-left_join(N, review, by='order_id')

3.2 Remove Missing Values

na_smry<-summary(is.na(df))
na_smry
##  customer_id     customer_unique_id customer_zip_code_prefix customer_city  
##  Mode :logical   Mode :logical      Mode :logical            Mode :logical  
##  FALSE:119143    FALSE:119143       FALSE:119143             FALSE:119143   
##                                                                             
##  customer_state   order_id       order_status    order_purchase_timestamp
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical           
##  FALSE:119143    FALSE:119143    FALSE:119143    FALSE:119143            
##                                                                          
##  order_approved_at order_delivered_carrier_date order_delivered_customer_date
##  Mode :logical     Mode :logical                Mode :logical                
##  FALSE:119143      FALSE:119143                 FALSE:119143                 
##                                                                              
##  order_estimated_delivery_date order_item_id   product_id      seller_id      
##  Mode :logical                 Mode :logical   Mode :logical   Mode :logical  
##  FALSE:119143                  FALSE:118310    FALSE:118310    FALSE:118310   
##                                TRUE :833       TRUE :833       TRUE :833      
##  shipping_limit_date   price         freight_value   product_category_name
##  Mode :logical       Mode :logical   Mode :logical   Mode :logical        
##  FALSE:118310        FALSE:118310    FALSE:118310    FALSE:118310         
##  TRUE :833           TRUE :833       TRUE :833       TRUE :833            
##  product_name_lenght product_description_lenght product_photos_qty
##  Mode :logical       Mode :logical              Mode :logical     
##  FALSE:116601        FALSE:116601               FALSE:116601      
##  TRUE :2542          TRUE :2542                 TRUE :2542        
##  product_weight_g product_length_cm product_height_cm product_width_cm
##  Mode :logical    Mode :logical     Mode :logical     Mode :logical   
##  FALSE:118290     FALSE:118290      FALSE:118290      FALSE:118290    
##  TRUE :853        TRUE :853         TRUE :853         TRUE :853       
##  seller_zip_code_prefix seller_city     seller_state    payment_sequential
##  Mode :logical          Mode :logical   Mode :logical   Mode :logical     
##  FALSE:118310           FALSE:118310    FALSE:118310    FALSE:119140      
##  TRUE :833              TRUE :833       TRUE :833       TRUE :3           
##  payment_type    payment_installments payment_value   review_id      
##  Mode :logical   Mode :logical        Mode :logical   Mode :logical  
##  FALSE:119140    FALSE:119140         FALSE:119140    FALSE:118146   
##  TRUE :3         TRUE :3              TRUE :3         TRUE :997      
##  review_score    review_comment_title review_comment_message
##  Mode :logical   Mode :logical        Mode :logical         
##  FALSE:118146    FALSE:118146         FALSE:118146          
##  TRUE :997       TRUE :997            TRUE :997             
##  review_creation_date review_answer_timestamp
##  Mode :logical        Mode :logical          
##  FALSE:118146         FALSE:118146           
##  TRUE :997            TRUE :997
df<-drop_na(df, names(df))
nrow(df)
## [1] 115633

3.3 Time Dealing

df$year_order_purchase_timestamp<-year(df$order_purchase_timestamp)
df$month_order_purchase_timestamp<-month(df$order_purchase_timestamp)
df$day_order_purchase_timestamp<-day(df$order_purchase_timestamp)

df$year_order_approved_at<-year(as.Date(df$order_approved_at))
df$month_order_approved_at<-month(as.Date(df$order_approved_at))
df$day_order_approved_at<-day(as.Date(df$order_approved_at))

df$year_order_delivered_carrier_date<-year(as.Date(df$order_delivered_carrier_date))
df$month_order_delivered_carrier_date<-month(as.Date(df$order_delivered_carrier_date))
df$day_order_delivered_carrier_date<-day(as.Date(df$order_delivered_carrier_date))

df$year_order_delivered_customer_date<-year(as.Date(df$order_delivered_customer_date))
df$month_order_delivered_customer_date<-month(as.Date(df$order_delivered_customer_date))
df$day_order_delivered_customer_date<-day(as.Date(df$order_delivered_customer_date))

df$year_order_estimated_delivery_date<-year(as.Date(df$order_estimated_delivery_date))
df$month_order_estimated_delivery_date<-month(as.Date(df$order_estimated_delivery_date))
df$day_order_estimated_delivery_date<-day(as.Date(df$order_estimated_delivery_date))

df$year_review_creation_date<-year(as.Date(df$review_creation_date))
df$month_review_creation_date<-month(as.Date(df$review_creation_date))
df$day_review_creation_date<-day(as.Date(df$review_creation_date))

df$year_review_answer_timestamp<-year(as.Date(df$review_answer_timestamp))
df$month_review_answer_timestamp<-month(as.Date(df$review_answer_timestamp))
df$day_review_answer_timestamp<-day(as.Date(df$review_answer_timestamp))

df$year_order_delivered_carrier_date<-year(as.Date(df$order_delivered_carrier_date))
df$month_order_delivered_carrier_date<-month(as.Date(df$order_delivered_carrier_date))
df$day_order_delivered_carrier_date<-day(as.Date(df$order_delivered_carrier_date))

df$year_shipping_limit_date<-year(as.Date(df$shipping_limit_date))
df$month_shipping_limit_date<-month(as.Date(df$shipping_limit_date))
df$day_shipping_limit_date<-day(as.Date(df$shipping_limit_date))

4. Exploratory Data Analysis

4.1 Location Analysis(State)

4.1.1 Customers Location(State)

As shown below, more than 40% customers are from Sao Paulo(SP), which is much higher than other states. And the second one is Rio de Janeiro(RJ).

customer_state_counts <- sqldf("
                               SELECT customer_state, COUNT(customer_state) AS counts
                               FROM df
                               GROUP BY customer_state
                               ORDER BY counts DESC
                               ")
# bar chart
customer_state_counts$percent <- round((customer_state_counts$counts/sum(customer_state_counts$counts))*100, 2)
options(repr.plot.width = 40, repr.plot.height =20)
ggplot(customer_state_counts, aes(x=reorder(customer_state, counts), y=counts)) +
  geom_bar(stat='identity', fill = '#00BD1F', alpha=.8) +
  geom_text(aes(label = paste0(percent, '%')), vjust = -0.9, size=2) +
  xlab('State Name') +
  ylab('Counts') +
  ggtitle('Number of Customer State') +
  theme_light() +
  theme(axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        plot.title = element_text(size = 20, face = 'bold'))

4.1.2 Sellers Location(State)

In the bar chart, almost 60% sellers are from Sao Paulo(SP), and the second one is Paraná(PR).

seller_state_counts <- sqldf("
                            SELECT seller_state,
                              COUNT(seller_state) AS counts
                            FROM df
                            GROUP BY seller_state
                            ORDER BY counts DESC
                            ")
# bar chart
seller_state_counts$percent <- round((seller_state_counts$counts/sum(seller_state_counts$counts))*100, 2)
ggplot(seller_state_counts, aes(x=reorder(seller_state, counts), y=counts)) +
  geom_bar(stat='identity', fill = '#4564FF', alpha=.8) +
  geom_text(aes(label = paste0(percent, '%')), vjust = -0.9, size=2) +
  xlab('State Name') +
  ylab('Counts') +
  ggtitle('Number of Seller State') +
  theme_light() +
  theme(axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 15),
        plot.title = element_text(size = 20, face = 'bold'))

4.2 Payment Analysis

4.2.1 Order Payments

There are many payment type currently, so make a donut chart to understand what is the most popular payment type in this ecommerce platform.

As you can see from the graph, credit cards are the most popular method of payment.

payment_type <- sqldf("
                      SELECT payment_type AS type,
                        COUNT(payment_type) AS counts,
                        ROUND(CAST(COUNT(payment_type) AS REAL)*100/CAST((SELECT COUNT(*) FROM df) AS REAL), 2) AS percent
                      FROM df
                      WHERE type != 'not_defined'
                      GROUP BY type
                      ORDER BY counts
                      ")
# donut chart
type_eng <- c('Debit card', 'Voucher', 'Boleto(Ticket)', 'Credit card')

payment_type %>%
    mutate(type_eng=type_eng) %>%
    mutate(ymax=cumsum(percent/100)) %>%
    mutate(ymin=c(0, head(ymax, n=-1))) %>%
    mutate(labelPosition=(ymax+ymin)/2) %>%
    mutate(label=paste0(percent, '%')) %>%
    ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=type_eng)) +
      geom_rect(color='white') +
      geom_text(x=4, aes(y=labelPosition, label=label), size=3) +
      coord_polar(theta = "y") +
      xlim(c(-1, 5)) +
      ggtitle('Customer Payment Type') +
      scale_fill_brewer('Payment Type', palette="RdYlGn") +
      theme_void() +
      theme(legend.key.size = unit(2, 'cm'),
        legend.title = element_text(size=10),
        legend.text = element_text(size=10),
        plot.title = element_text(size = 20, face = 'bold')) +
      annotate(geom = 'text', x = -1, y = 0, size=5, 
           label = paste0('Total:\n', sum(payment_type$counts)))

4.3 Orders Information

4.3.1 Order Delivered Time

In this part, we find different information about the orders dataframe, such as the number of orders each year or the delivery time of each order. In the donut chart, the largest part is 2018 and the second one is 2017 then 2016. So we can easily find the order number was continously growing up.

Orders$purchased <- as.Date(Orders$order_purchase_timestamp)
Orders$delivered <- as.Date(Orders$order_delivered_customer_date)

delivery <- sqldf("
                  SELECT order_status, 
                    CAST(CAST(order_purchase_timestamp AS DATETIME) AS CHAR) AS year, 
                    order_purchase_timestamp AS purchase_date,
                    order_delivered_customer_date AS delivered_date, 
                    delivered-purchased AS delivered_days
                  FROM Orders
                  WHERE order_status = 'delivered'
                    AND delivered_days != 'NA'
                  ORDER BY delivered_days DESC
                  ")
# statistics
delivery_stat <- sqldf("
                       SELECT MAX(delivered_days) AS max,
                        MIN(delivered_days) AS min,
                        AVG(delivered_days) AS mean,
                        MEDIAN(delivered_days) AS median,
                        STDEV(delivered_days) AS standard_deviation
                      FROM delivery
                       ")
delivery_stat
##   max min     mean median standard_deviation
## 1 210   0 12.49685     10           9.555071
# donut chart
delivery %>%
  select(year) %>%
  count(year) %>%
  mutate(ymax = cumsum(n/sum(n))) %>%
  mutate(ymin = c(0, head(ymax, -1))) %>%
  mutate(labelPosition = (ymax+ymin)/2) %>%
  mutate(label = paste0(round(labelPosition*100, 2), '%')) %>%
  ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=year)) +
  geom_rect(color='white') +
  geom_text(x=4, aes(y=labelPosition, label=label), size=3.5,
            alpha=c(0, rep(1, 2))) +
  geom_text(x=5.3, aes(y=labelPosition[1], label=label[1]), size=3.5) +
  coord_polar(theta = "y") +
  xlim(c(-1, 5)) +
  ggtitle('Year of Orders') +
  scale_fill_brewer('Year', palette="Set1") +
  theme_void() +
  theme(legend.key.size = unit(2, 'cm'),
        legend.title = element_text(size=15),
        legend.text = element_text(size=10),
        plot.title = element_text(size = 20, face = 'bold')) +
  annotate(geom = 'text', x = -1, y = 0, size=5, 
           label = paste0('Total:\n', dim(delivery)[1]))

4.3.2 Order Reviews

Make a donut chart and bar chart to see the review score. Although most reviews are very high, the 1 score part is about 12%.

review_score <- sqldf("
                      SELECT CAST(review_score AS CHAR) AS score, 
                        COUNT(review_score) AS counts,
                        ROUND(CAST(COUNT(review_score) AS REAL)*100/CAST((SELECT COUNT(*) FROM review) AS REAL), 2) AS percent
                      FROM review
                      GROUP BY score
                      ORDER BY counts
                      ")
# donut chart
review_score %>%
    mutate(ymax=cumsum(percent/100)) %>%
    mutate(ymin=c(0, head(ymax, n=-1))) %>%
    mutate(labelPosition=(ymax+ymin)/2) %>%
    mutate(label=paste0(percent, '%')) %>%
    ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=score)) +
      geom_rect(color='white') +
      geom_text(x=4, aes(y=labelPosition, label=label), size=3.5) +
      coord_polar(theta = "y") +
      xlim(c(-1, 5)) +
      ggtitle('Customer Review Score') +
      scale_fill_brewer('Score', palette="RdYlGn") +
      theme_void() +
      theme(legend.key.size = unit(2, 'cm'),
            legend.title = element_text(size=10),
            legend.text = element_text(size=10),
            plot.title = element_text(size = 20, face = 'bold')) +
      annotate(geom = 'text', x = -1, y = 0, size=10, 
               label = paste0('Total:\n', sum(review_score$counts)))

ggplot(review_score, aes(x=score, y=counts)) +
  geom_bar(stat="identity", fill="#FFDD00", width=.4) +
  geom_text(aes(label = paste0(percent, '%')), vjust = -0.5, hjust=0.4, size=4) +
  ylim(0, 65000) +
  xlab("Score") +
  ylab("Counts") +
  ggtitle("Percentage of Review Score from Customers") +
  theme_light() +
  theme(axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 13, vjust = -0.3),
        axis.title.y = element_text(size = 13),
        plot.title = element_text(size = 20, face = 'bold'))

4.4 Stimulation For Customers/Product Categories/Sellers

4.4.1 Stimulation For Customers

The graph shows, in 2017, there has been a positive trend line in number of New Customers [Customer Unique Identity] getting registered with Olist. In 2018, more than 6000 were getting registered every month.

Orders1 <- mutate(Orders, Est.Lead.t = interval(ymd_hms(Orders$order_purchase_timestamp), ymd_hms(Orders$order_estimated_delivery_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Act.Lead.t = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_delivered_customer_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Approval.mins = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_approved_at)) %/% minutes(x = 1))
Orders1 <- mutate(Orders1, Appr.to.Car = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_delivered_carrier_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Car.to.Cus = interval(ymd_hms(Orders1$order_delivered_carrier_date), ymd_hms(Orders$order_delivered_customer_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, purchase_quart = quarter(Orders1$order_purchase_timestamp, with_year = TRUE))
Orders1 <- mutate(Orders1, purchase_month = month(Orders1$order_purchase_timestamp, label = TRUE, abbr = FALSE))
Orders1 <- mutate(Orders1, purchase_year = year(Orders1$order_purchase_timestamp))

Items1 <- mutate(Items, order.price = (order_item_id*price))
Items1 <- mutate(Items1, order.freight = (order_item_id*freight_value))
Items1 <- mutate(Items1, total.order.cost = (order.price + order.freight))
Items1 <- mutate(Items1, fr.sup.pr = if_else(freight_value >= price, 1, 0))
Cust.gr <- inner_join(Orders1, customer, by = "customer_id")
Cust.gr %>% select_at(vars(purchase_year, purchase_month, customer_unique_id)) %>% distinct_at(vars(purchase_year, purchase_month, customer_unique_id)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(y = n, x = as.factor(purchase_month), label = n, color = factor(purchase_year)))+
  geom_point(size = 5, color = 'navy', shape = 21, alpha = 0.54)+
  geom_point(size = 3.75, color = 'navy', shape = 21, alpha = 0.72)+
  geom_label(aes(fill = factor(purchase_year)), colour = "white", fontface = "bold", vjust = 1, size = 3.25)+
  facet_grid(.~purchase_year)+
              theme_minimal()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = c(0.1,0.7))+
  labs(title = "Trend of new customer registration per month", x = "Months", y = "Number of Customers", fill = "Purchase Year")
## Warning: `count_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `count()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <]8;;https://github.com/tidyverse/dplyr/issueshttps://github.com/tidyverse/dplyr/issues]8;;>.

4.4.2 Stimulation For Products

The diagram below illustrates that, if addition 50 New Product Categories per month,this will see a trend of customer growth. Consistency in this phenomenon was Olist’s byword for 20 months i.e. from January 2017 to August 2018.

Products <- left_join(products, nametrans, by = "product_category_name")
Items1 <- left_join(Items1, Products, by = "product_id")
Pdt.Gr <- inner_join(Orders1, Items1, by = "order_id")

Pdt.Gr %>% select_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% distinct_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(x = factor(purchase_month), y = n))+
  geom_point(color = "darkorchid", alpha = 0.72, shape = 21, aes(size = n), fill = alpha("snow2", 0.3))+
  geom_point(color = "darkorchid", alpha = 0.81, shape = 21, aes(size = n*0.6), fill = alpha("snow2", 0.3))+
  geom_segment(aes(x = purchase_month, xend = purchase_month, y = 0, yend = n), alpha = 0.45, linetype = 3, color = "darkorchid")+
  coord_polar(start = 2)+
  facet_wrap(.~purchase_year)+
  theme_minimal()+
  theme(axis.text = element_text(size = 8), panel.grid.major = element_line(), legend.position = "bottom")+
  labs(title = "New Product categories introduced per month", x = "Months", y = "Number of Product Categories")

4.4.2 Stimulation For Sellers

Analysis of the graph below shows that sellers too maintained the similar trend as that of customers, and within the time frame of this data, Olist ended up having +3095 registered sellers on it’s platform.

Seller.Gr <- inner_join(Orders1, left_join(Items1, sellers, by = "seller_id"), by = "order_id")

Seller.Gr %>% select_at(vars(purchase_year, purchase_month, seller_id)) %>% distinct_at(vars(purchase_year, purchase_month, seller_id)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(x = factor(purchase_month), y = n))+
  geom_point(color = "red", alpha = 0.63, shape = 21, aes(size = n), fill = alpha("cornsilk", 0.3))+
  geom_point(color = "red", alpha = 0.81, shape = 21, aes(size = n*0.6), fill = alpha("cornsilk", 0.3))+
  geom_segment(aes(x = purchase_month, xend = purchase_month, y = 0, yend = n), alpha = 0.45, linetype = 3, color = "red")+
  coord_polar(start = 2)+
  facet_wrap(.~purchase_year)+
  theme_minimal()+
  theme(axis.text = element_text(size = 8), panel.grid.major = element_line(), legend.position = "bottom")+
  labs(title = "Increase in number of sellers per month", x = "Months", y = "Number of Sellers")

4.5 Products

First, add the English category name into the product dataframe. And multiply the columns of length, width, and height as product volume.

index_no_translation <- which(!(products$product_category_name %in% nametrans$original_name) & products$product_category_name!='')
category_no_translation <- unique(products$product_category_name[index_no_translation])
nametrans[nrow(nametrans)+1, ] <- c('pc_gamer', 'pc_gamer')
nametrans[nrow(nametrans)+1, ] <- c('portateis_cozinha_e_preparadores_de_alimentos', 'portable_kitchen_and_food_preparation')

category <- sqldf("
                  SELECT df.product_category_name AS original_name, 
                    nametrans.product_category_name_english,
                    df.product_weight_g AS weight,
                    product_length_cm*product_height_cm*product_width_cm AS volume
                  FROM df
                  LEFT JOIN nametrans
                    ON df.product_category_name = nametrans.product_category_name
                  WHERE df.product_category_name != ''
                  ")

4.5.1 Statestic Tables About Products

In the following, list the statestic tables people may be curious, such as which one category is the most popular in the Olist platform and which is most heaviest or largest.

most5 <- sqldf("
               SELECT product_category_name_english,
                 COUNT(product_category_name_english) AS counts
               FROM category
               GROUP BY product_category_name_english
               ORDER BY counts DESC
               LIMIT 5
               ")
most5
##   product_category_name_english counts
## 1                bed_bath_table  11847
## 2                 health_beauty   9944
## 3                sports_leisure   8942
## 4               furniture_decor   8743
## 5         computers_accessories   8105
least5 <- sqldf("
                SELECT product_category_name_english, 
                  COUNT(product_category_name_english) AS counts
                FROM category
                GROUP BY product_category_name_english
                ORDER BY counts
                LIMIT 5
                ")
least5
##   product_category_name_english counts
## 1         security_and_services      2
## 2     fashion_childrens_clothes      8
## 3                      pc_gamer     10
## 4             cds_dvds_musicals     14
## 5                    la_cuisine     15
heavy5 <- sqldf("
                SELECT product_category_name_english,
                  AVG(weight) AS weight_group_mean
                FROM category
                GROUP BY product_category_name_english
                ORDER BY weight_group_mean DESC
                LIMIT 5
                ")
heavy5
##             product_category_name_english weight_group_mean
## 1                        office_furniture         11281.065
## 2                       furniture_bedroom         10124.405
## 3                       home_appliances_2          9888.052
## 4 kitchen_dining_laundry_garden_furniture          8670.963
## 5       furniture_mattress_and_upholstery          8446.341
light5 <- sqldf("
                SELECT product_category_name_english,
                  AVG(weight) AS weight_group_mean
                FROM category
                GROUP BY product_category_name_english
                ORDER BY weight_group_mean
                LIMIT 5
                ")
light5
##   product_category_name_english weight_group_mean
## 1                     telephony          262.0009
## 2     fashion_childrens_clothes          265.0000
## 3       fashion_underwear_beach          269.7552
## 4        tablets_printing_image          292.4118
## 5                 fashion_sport          335.4839
large5 <- sqldf("
                SELECT product_category_name_english,
                  AVG(volume) AS volume_group_mean
                FROM category
                GROUP BY product_category_name_english
                ORDER BY volume_group_mean DESC
                LIMIT 5
                ")
large5
##             product_category_name_english volume_group_mean
## 1                        office_furniture          73454.07
## 2       furniture_mattress_and_upholstery          68050.59
## 3                       home_appliances_2          53813.25
## 4 kitchen_dining_laundry_garden_furniture          51475.60
## 5                       furniture_bedroom          47393.85
small5 <- sqldf("
                SELECT product_category_name_english,
                  AVG(volume) AS volume_group_mean
                FROM category
                GROUP BY product_category_name_english
                ORDER BY volume_group_mean
                LIMIT 5
                ")
small5
##   product_category_name_english volume_group_mean
## 1                     telephony          1803.015
## 2                  dvds_blu_ray          1889.743
## 3                books_imported          2076.500
## 4                 watches_gifts          2834.818
## 5               books_technical          3174.119

4.5.2 Products Purchased Multiple Times

Analysis by results, all Product Categories were purchased multiple times. The graph below shows which categories are high frequency categories.

Pdt.Gr %>% group_by_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% filter(!is.na(product_category_name_english))%>% count(product_category_name_english) %>% arrange(desc(n)) %>% filter(!is.na(product_category_name_english) & n >1) %>% ggplot(aes(x = product_category_name_english, y = n))+
  geom_boxplot(color = "blueviolet", alpha = 0.63)+
  coord_flip()+
  facet_wrap(.~reorder(purchase_year, desc(purchase_year)), scales = "free_x")+
  theme_minimal()+
  scale_y_reverse()+
  theme(axis.text.y = element_text(size = 7.25), panel.grid.major.x = element_line(color = "white"), panel.grid.minor.x = element_line(color = "white"), panel.grid.major.y =  element_line(color = "grey95", linetype = 3))+
  labs(title = "Categories purchased at high frequency per year", x = "Product Categories", y = "Number of times Product Category purchased")

5. Predictive Analysis

5.1 Classifcation Problem

We would like to predict if an order will be delivered or not (classification problem) using Random Forest and predict the freight value(Regression problem) using linear regression

5.1.0 Data Transformation

Here we will do data prepreocessing such as removing missing values , one hot encoding and choosing target variables

table(df$order_status)
## 
##    approved    canceled   delivered    invoiced  processing     shipped 
##           3         538      113232         358         357        1138 
## unavailable 
##           7
# target label preprocessing
df[which(df$order_status!='delivered'),]$order_status<-1
df[which(df$order_status=='delivered'),]$order_status<-2

df3<-select(df, -c('customer_id', 'customer_unique_id','order_id','order_item_id', 'product_id',
                  'seller_id','review_id','review_comment_title','review_comment_message','customer_city','customer_state',
                'seller_city','seller_state','payment_type','order_purchase_timestamp',
                  'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 
                  'order_estimated_delivery_date','review_creation_date',
                  'review_answer_timestamp','shipping_limit_date','product_category_name' ))
#back up
backup<- df
df<-backup
# one-hot encoding will conver categorical columns into numnerical 
library('caret')
df_dummy<-select(df, c('customer_state', 'order_status', 'product_category_name', 'seller_city', 'seller_state', 'payment_type'))
dum<-dummyVars( ~ customer_state, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)


dum<-dummyVars( ~ product_category_name, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)

unique(df_dummy$seller_state)
##  [1] "SP" "SC" "RJ" "PR" "MG" "ES" "RS" "DF" "PE" "MA" "BA" "GO" "MT" "MS" "RO"
## [16] "PB" "CE" "PA" "RN" "PI" "SE" "AC" "AM"
dum<-dummyVars( ~ seller_state, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)
# df3 contains many null data from Date attributes, and most of them are label 1. So we cannot drop them
df3[is.na(df3)]<-0

# drop unnecessary attributeslst()# drop unnecessary attributes
df_dummy<-select(df_dummy, -c('customer_state', 'product_category_name', 'seller_city', 'seller_state', 'payment_type'))
lst<-cbind(df3, df_dummy)

5.1.1 Correlational Analysis

Here we will perform correlational analysis . We would like to identify attributes that has impact on the target variable ‘order_status’

# drop missing value
na_smry<-summary(is.na(lst))
lst<-drop_na(lst, names(lst))
#
data_cor <-cor(lst[ , colnames(lst) !="order_status"],  as.integer(lst$order_status))
x=data_cor >= abs(0.05)
sum(x, na.rm=TRUE)
## [1] 7
which(data_cor >= abs(0.05))
## [1] 15 22 23 24 25 26 27
# show all the correlated variables which is higher than 0.05
x[which(x==TRUE),]
##                        review_score   year_order_delivered_carrier_date 
##                                TRUE                                TRUE 
##  month_order_delivered_carrier_date    day_order_delivered_carrier_date 
##                                TRUE                                TRUE 
##  year_order_delivered_customer_date month_order_delivered_customer_date 
##                                TRUE                                TRUE 
##   day_order_delivered_customer_date 
##                                TRUE
# select all the correlated data
df4= select(df3,c('review_score','year_order_delivered_carrier_date', 'month_order_delivered_carrier_date',                                   
                  'day_order_delivered_carrier_date', 'year_order_delivered_customer_date','month_order_delivered_customer_date',                                 
                  'day_order_delivered_customer_date','order_status')) 

# convert the target variable into factor 

df4$order_status<- as.factor(df4$order_status) 

5.1.2 Random Forest Classifier

here we will import the Random forest which will help us solve the classification problem ‘how to predit if an order will be delivered or not?’ we split the dataset into training and testing and fit the model into the dataset

# load those libraries
library(stats)
library(dplyr)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caTools)

# Splitting data in train and test data
split <- sample.split(df4, SplitRatio = 0.7)
train <- subset(df4, split == "TRUE")
test <- subset(df4, split == "FALSE")

# Fitting Random Forest to the train dataset
set.seed(120)  # Setting seed
classifier_RF = randomForest(x = train[-8],
                             y = train$order_status,
                             ntree = 100)

5.1.3 Model Evaluation

The model has performed very well . The accuracy is very high 99.99% .This is because the dataset is non linear and RF performs well in such dataset

# Evaluate the model 
status_pred = predict(classifier_RF, test) # will use RFM to predict Testing 
test$status_pred = status_pred

# building the confusion matrix 
CFM = table(test$order_status,test$status_pred)
CFM
##    
##         1     2
##   1   947     4
##   2     5 42406
# show the accuracy for the classification
classification_Accuracy = sum(diag(CFM)/sum(CFM)) 
classification_Accuracy
## [1] 0.9997924

5.2 Regression Problem

here we will predict the freight value of an order .We will use linear Regression to solve this problem . First we will perform correlational analysis to see the attributes that impact the target variable Freight value

#### 5.2.0 Correlational Analysis

Here we will perform correlational analysis . We would like to identify attributes that has impact on the target variable ‘freight_value’

# calculate the correlated attributes with freight value.
# freight value is the next target for the regression.
lst<-lst[ , colnames(lst) != "order_status"]
data_cor <-cor(lst,  lst$freight_value)
x=data_cor >= abs(0.05)
sum(x, na.rm=TRUE)
## [1] 35
which(data_cor >= abs(0.05))
##  [1]   1   2   3   5   7   8   9  10  11  13  14  41  44  45  49  52  53  54  55
## [20]  56  59  60  64  66  96 120 122 123 124 128 142 148 155 159 160
# show all the correlated attributes whihc is higher than 0.05
x[which(x==TRUE),]
##                                            customer_zip_code_prefix 
##                                                                TRUE 
##                                                               price 
##                                                                TRUE 
##                                                       freight_value 
##                                                                TRUE 
##                                          product_description_lenght 
##                                                                TRUE 
##                                                    product_weight_g 
##                                                                TRUE 
##                                                   product_length_cm 
##                                                                TRUE 
##                                                   product_height_cm 
##                                                                TRUE 
##                                                    product_width_cm 
##                                                                TRUE 
##                                              seller_zip_code_prefix 
##                                                                TRUE 
##                                                payment_installments 
##                                                                TRUE 
##                                                       payment_value 
##                                                                TRUE 
##                                                    customer_stateAL 
##                                                                TRUE 
##                                                    customer_stateBA 
##                                                                TRUE 
##                                                    customer_stateCE 
##                                                                TRUE 
##                                                    customer_stateMA 
##                                                                TRUE 
##                                                    customer_stateMT 
##                                                                TRUE 
##                                                    customer_statePA 
##                                                                TRUE 
##                                                    customer_statePB 
##                                                                TRUE 
##                                                    customer_statePE 
##                                                                TRUE 
##                                                    customer_statePI 
##                                                                TRUE 
##                                                    customer_stateRN 
##                                                                TRUE 
##                                                    customer_stateRO 
##                                                                TRUE 
##                                                    customer_stateSE 
##                                                                TRUE 
##                                                    customer_stateTO 
##                                                                TRUE 
##                             product_category_nameeletrodomesticos_2 
##                                                                TRUE 
## product_category_namemoveis_cozinha_area_de_servico_jantar_e_jardim 
##                                                                TRUE 
##                              product_category_namemoveis_escritorio 
##                                                                TRUE 
##                                  product_category_namemoveis_quarto 
##                                                                TRUE 
##                                    product_category_namemoveis_sala 
##                                                                TRUE 
##                                            product_category_namepcs 
##                                                                TRUE 
##                                                      seller_stateBA 
##                                                                TRUE 
##                                                      seller_stateMG 
##                                                                TRUE 
##                                                      seller_statePR 
##                                                                TRUE 
##                                                      seller_stateRS 
##                                                                TRUE 
##                                                      seller_stateSC 
##                                                                TRUE
# extract all the correlated attributes to generate a new data frame
df5<-lst[ , which(data_cor >= abs(0.05))]

5.2.1 Linear Regression Model

here we will import the model , split the dataset and fit the model into the dataset

# fix a regression model and see it's R-Square to assess

# Splitting data in train and test data
split <- sample.split(df5, SplitRatio = 0.7)

train <- subset(df5, split == "TRUE")
test <- subset(df5, split == "FALSE")
set.seed(120)  # Setting seed
lm_model<-lm(freight_value ~.,data=train)
summary(lm_model)$adj.r.squared
## [1] 0.5551996

5.2. Evaluation of the Regression Model

Linear Regression has performed poorly on this dataset( MSE is 111 and Adjusted R squared = 0.54). This is because the dataset is non linear

# calculate the Mean Square Error of the prediction performance and the model itself
test$pred<-predict(lm_model, newdata=test)
MSE<-mean((test$freight_value - test$pred)^2)
print(MSE)
## [1] 116.9281
print(mean(lm_model$residuals^2))
## [1] 110.8917