In this project we will study the Brazillian E-commerce data.After that we will perform some data preprocessing , Exploratory Data Analysis, and Predictive Analyses Using Machine Learning Models .
Customers Data: This dataset has information about the customer and its location. Each order is assigned to a unique customerid. This means that the same customer will get different ids for different orders. The purpose of having a customerunique_id on the dataset is to allow you to identify customers that made repurchases at the store.
Geolocation Data: This dataset has information Brazilian zip codes and its lat/lng coordinates. Use it to plot maps and find distances between sellers and customers.
Order Items Dataset: This dataset includes data about the items purchased within each order
Order Paymets Dataset: This dataset includes data about the orders payment options.
Order Reviews Dataset: This dataset includes data about the reviews made by the customers.
Order Dataset: This is the core dataset. From each order you might find all other information.
Product Dataset: This dataset includes data about the products sold by Olist.
Sellers Dataset: This dataset includes data about the sellers that fulfilled orders made at Olist.
Use it to find the seller location and to identify which seller fulfilled each product.
Product Category Name translation: Translates the productcategoryname to english.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(knitr)
library(lattice)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.4
## ✔ tidyr 1.1.4 ✔ stringr 1.5.0
## ✔ readr 2.1.1 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: '/opt/X11/lib/libSM.6.dylib'
## Referenced from: '/Library/Frameworks/R.framework/Versions/4.1/Resources/modules/R_X11.so'
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home/jre/lib/server/libSM.6.dylib' (no such file)
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggrepel)
library(geodata)
## Loading required package: terra
## terra 1.5.21
##
## Attaching package: 'terra'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:knitr':
##
## spin
## The following object is masked from 'package:ggplot2':
##
## arrow
## The following object is masked from 'package:dplyr':
##
## src
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
# read data
review<-read.csv("olist_order_reviews_dataset.csv", header = TRUE)
Orders<-read.csv("olist_orders_dataset.csv", header = TRUE)
products<-read.csv("olist_products_dataset.csv", header = TRUE)
sellers<-read.csv("olist_sellers_dataset.csv", header = TRUE)
customer<-read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation<-read.csv("olist_geolocation_dataset.csv", header = TRUE)
payments<-read.csv("olist_order_payments_dataset.csv", header = TRUE)
Items<-read.csv("olist_order_items_dataset.csv", header = TRUE)
nametrans<-read.csv("product_category_name_translation.csv", header = TRUE)
c<-left_join(customer, Orders, by='customer_id')
x<-left_join(Items, products, by='product_id')
s<-left_join(x, sellers, by='seller_id')
k<-left_join(c, s, by='order_id')
N<-left_join(k, payments, by='order_id')
df<-left_join(N, review, by='order_id')
na_smry<-summary(is.na(df))
na_smry
## customer_id customer_unique_id customer_zip_code_prefix customer_city
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:119143 FALSE:119143 FALSE:119143 FALSE:119143
##
## customer_state order_id order_status order_purchase_timestamp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:119143 FALSE:119143 FALSE:119143 FALSE:119143
##
## order_approved_at order_delivered_carrier_date order_delivered_customer_date
## Mode :logical Mode :logical Mode :logical
## FALSE:119143 FALSE:119143 FALSE:119143
##
## order_estimated_delivery_date order_item_id product_id seller_id
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:119143 FALSE:118310 FALSE:118310 FALSE:118310
## TRUE :833 TRUE :833 TRUE :833
## shipping_limit_date price freight_value product_category_name
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:118310 FALSE:118310 FALSE:118310 FALSE:118310
## TRUE :833 TRUE :833 TRUE :833 TRUE :833
## product_name_lenght product_description_lenght product_photos_qty
## Mode :logical Mode :logical Mode :logical
## FALSE:116601 FALSE:116601 FALSE:116601
## TRUE :2542 TRUE :2542 TRUE :2542
## product_weight_g product_length_cm product_height_cm product_width_cm
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:118290 FALSE:118290 FALSE:118290 FALSE:118290
## TRUE :853 TRUE :853 TRUE :853 TRUE :853
## seller_zip_code_prefix seller_city seller_state payment_sequential
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:118310 FALSE:118310 FALSE:118310 FALSE:119140
## TRUE :833 TRUE :833 TRUE :833 TRUE :3
## payment_type payment_installments payment_value review_id
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:119140 FALSE:119140 FALSE:119140 FALSE:118146
## TRUE :3 TRUE :3 TRUE :3 TRUE :997
## review_score review_comment_title review_comment_message
## Mode :logical Mode :logical Mode :logical
## FALSE:118146 FALSE:118146 FALSE:118146
## TRUE :997 TRUE :997 TRUE :997
## review_creation_date review_answer_timestamp
## Mode :logical Mode :logical
## FALSE:118146 FALSE:118146
## TRUE :997 TRUE :997
df<-drop_na(df, names(df))
nrow(df)
## [1] 115633
df$year_order_purchase_timestamp<-year(df$order_purchase_timestamp)
df$month_order_purchase_timestamp<-month(df$order_purchase_timestamp)
df$day_order_purchase_timestamp<-day(df$order_purchase_timestamp)
df$year_order_approved_at<-year(as.Date(df$order_approved_at))
df$month_order_approved_at<-month(as.Date(df$order_approved_at))
df$day_order_approved_at<-day(as.Date(df$order_approved_at))
df$year_order_delivered_carrier_date<-year(as.Date(df$order_delivered_carrier_date))
df$month_order_delivered_carrier_date<-month(as.Date(df$order_delivered_carrier_date))
df$day_order_delivered_carrier_date<-day(as.Date(df$order_delivered_carrier_date))
df$year_order_delivered_customer_date<-year(as.Date(df$order_delivered_customer_date))
df$month_order_delivered_customer_date<-month(as.Date(df$order_delivered_customer_date))
df$day_order_delivered_customer_date<-day(as.Date(df$order_delivered_customer_date))
df$year_order_estimated_delivery_date<-year(as.Date(df$order_estimated_delivery_date))
df$month_order_estimated_delivery_date<-month(as.Date(df$order_estimated_delivery_date))
df$day_order_estimated_delivery_date<-day(as.Date(df$order_estimated_delivery_date))
df$year_review_creation_date<-year(as.Date(df$review_creation_date))
df$month_review_creation_date<-month(as.Date(df$review_creation_date))
df$day_review_creation_date<-day(as.Date(df$review_creation_date))
df$year_review_answer_timestamp<-year(as.Date(df$review_answer_timestamp))
df$month_review_answer_timestamp<-month(as.Date(df$review_answer_timestamp))
df$day_review_answer_timestamp<-day(as.Date(df$review_answer_timestamp))
df$year_order_delivered_carrier_date<-year(as.Date(df$order_delivered_carrier_date))
df$month_order_delivered_carrier_date<-month(as.Date(df$order_delivered_carrier_date))
df$day_order_delivered_carrier_date<-day(as.Date(df$order_delivered_carrier_date))
df$year_shipping_limit_date<-year(as.Date(df$shipping_limit_date))
df$month_shipping_limit_date<-month(as.Date(df$shipping_limit_date))
df$day_shipping_limit_date<-day(as.Date(df$shipping_limit_date))
As shown below, more than 40% customers are from Sao Paulo(SP), which is much higher than other states. And the second one is Rio de Janeiro(RJ).
customer_state_counts <- sqldf("
SELECT customer_state, COUNT(customer_state) AS counts
FROM df
GROUP BY customer_state
ORDER BY counts DESC
")
# bar chart
customer_state_counts$percent <- round((customer_state_counts$counts/sum(customer_state_counts$counts))*100, 2)
options(repr.plot.width = 40, repr.plot.height =20)
ggplot(customer_state_counts, aes(x=reorder(customer_state, counts), y=counts)) +
geom_bar(stat='identity', fill = '#00BD1F', alpha=.8) +
geom_text(aes(label = paste0(percent, '%')), vjust = -0.9, size=2) +
xlab('State Name') +
ylab('Counts') +
ggtitle('Number of Customer State') +
theme_light() +
theme(axis.text = element_text(size = 10),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
plot.title = element_text(size = 20, face = 'bold'))
In the bar chart, almost 60% sellers are from Sao Paulo(SP), and the second one is Paraná(PR).
seller_state_counts <- sqldf("
SELECT seller_state,
COUNT(seller_state) AS counts
FROM df
GROUP BY seller_state
ORDER BY counts DESC
")
# bar chart
seller_state_counts$percent <- round((seller_state_counts$counts/sum(seller_state_counts$counts))*100, 2)
ggplot(seller_state_counts, aes(x=reorder(seller_state, counts), y=counts)) +
geom_bar(stat='identity', fill = '#4564FF', alpha=.8) +
geom_text(aes(label = paste0(percent, '%')), vjust = -0.9, size=2) +
xlab('State Name') +
ylab('Counts') +
ggtitle('Number of Seller State') +
theme_light() +
theme(axis.text = element_text(size = 10),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 15),
plot.title = element_text(size = 20, face = 'bold'))
There are many payment type currently, so make a donut chart to understand what is the most popular payment type in this ecommerce platform.
As you can see from the graph, credit cards are the most popular method of payment.
payment_type <- sqldf("
SELECT payment_type AS type,
COUNT(payment_type) AS counts,
ROUND(CAST(COUNT(payment_type) AS REAL)*100/CAST((SELECT COUNT(*) FROM df) AS REAL), 2) AS percent
FROM df
WHERE type != 'not_defined'
GROUP BY type
ORDER BY counts
")
# donut chart
type_eng <- c('Debit card', 'Voucher', 'Boleto(Ticket)', 'Credit card')
payment_type %>%
mutate(type_eng=type_eng) %>%
mutate(ymax=cumsum(percent/100)) %>%
mutate(ymin=c(0, head(ymax, n=-1))) %>%
mutate(labelPosition=(ymax+ymin)/2) %>%
mutate(label=paste0(percent, '%')) %>%
ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=type_eng)) +
geom_rect(color='white') +
geom_text(x=4, aes(y=labelPosition, label=label), size=3) +
coord_polar(theta = "y") +
xlim(c(-1, 5)) +
ggtitle('Customer Payment Type') +
scale_fill_brewer('Payment Type', palette="RdYlGn") +
theme_void() +
theme(legend.key.size = unit(2, 'cm'),
legend.title = element_text(size=10),
legend.text = element_text(size=10),
plot.title = element_text(size = 20, face = 'bold')) +
annotate(geom = 'text', x = -1, y = 0, size=5,
label = paste0('Total:\n', sum(payment_type$counts)))
In this part, we find different information about the orders dataframe, such as the number of orders each year or the delivery time of each order. In the donut chart, the largest part is 2018 and the second one is 2017 then 2016. So we can easily find the order number was continously growing up.
Orders$purchased <- as.Date(Orders$order_purchase_timestamp)
Orders$delivered <- as.Date(Orders$order_delivered_customer_date)
delivery <- sqldf("
SELECT order_status,
CAST(CAST(order_purchase_timestamp AS DATETIME) AS CHAR) AS year,
order_purchase_timestamp AS purchase_date,
order_delivered_customer_date AS delivered_date,
delivered-purchased AS delivered_days
FROM Orders
WHERE order_status = 'delivered'
AND delivered_days != 'NA'
ORDER BY delivered_days DESC
")
# statistics
delivery_stat <- sqldf("
SELECT MAX(delivered_days) AS max,
MIN(delivered_days) AS min,
AVG(delivered_days) AS mean,
MEDIAN(delivered_days) AS median,
STDEV(delivered_days) AS standard_deviation
FROM delivery
")
delivery_stat
## max min mean median standard_deviation
## 1 210 0 12.49685 10 9.555071
# donut chart
delivery %>%
select(year) %>%
count(year) %>%
mutate(ymax = cumsum(n/sum(n))) %>%
mutate(ymin = c(0, head(ymax, -1))) %>%
mutate(labelPosition = (ymax+ymin)/2) %>%
mutate(label = paste0(round(labelPosition*100, 2), '%')) %>%
ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=year)) +
geom_rect(color='white') +
geom_text(x=4, aes(y=labelPosition, label=label), size=3.5,
alpha=c(0, rep(1, 2))) +
geom_text(x=5.3, aes(y=labelPosition[1], label=label[1]), size=3.5) +
coord_polar(theta = "y") +
xlim(c(-1, 5)) +
ggtitle('Year of Orders') +
scale_fill_brewer('Year', palette="Set1") +
theme_void() +
theme(legend.key.size = unit(2, 'cm'),
legend.title = element_text(size=15),
legend.text = element_text(size=10),
plot.title = element_text(size = 20, face = 'bold')) +
annotate(geom = 'text', x = -1, y = 0, size=5,
label = paste0('Total:\n', dim(delivery)[1]))
Make a donut chart and bar chart to see the review score. Although most reviews are very high, the 1 score part is about 12%.
review_score <- sqldf("
SELECT CAST(review_score AS CHAR) AS score,
COUNT(review_score) AS counts,
ROUND(CAST(COUNT(review_score) AS REAL)*100/CAST((SELECT COUNT(*) FROM review) AS REAL), 2) AS percent
FROM review
GROUP BY score
ORDER BY counts
")
# donut chart
review_score %>%
mutate(ymax=cumsum(percent/100)) %>%
mutate(ymin=c(0, head(ymax, n=-1))) %>%
mutate(labelPosition=(ymax+ymin)/2) %>%
mutate(label=paste0(percent, '%')) %>%
ggplot(aes(ymax=ymax, ymin=ymin, xmax=5, xmin=3, fill=score)) +
geom_rect(color='white') +
geom_text(x=4, aes(y=labelPosition, label=label), size=3.5) +
coord_polar(theta = "y") +
xlim(c(-1, 5)) +
ggtitle('Customer Review Score') +
scale_fill_brewer('Score', palette="RdYlGn") +
theme_void() +
theme(legend.key.size = unit(2, 'cm'),
legend.title = element_text(size=10),
legend.text = element_text(size=10),
plot.title = element_text(size = 20, face = 'bold')) +
annotate(geom = 'text', x = -1, y = 0, size=10,
label = paste0('Total:\n', sum(review_score$counts)))
ggplot(review_score, aes(x=score, y=counts)) +
geom_bar(stat="identity", fill="#FFDD00", width=.4) +
geom_text(aes(label = paste0(percent, '%')), vjust = -0.5, hjust=0.4, size=4) +
ylim(0, 65000) +
xlab("Score") +
ylab("Counts") +
ggtitle("Percentage of Review Score from Customers") +
theme_light() +
theme(axis.text = element_text(size = 10),
axis.title.x = element_text(size = 13, vjust = -0.3),
axis.title.y = element_text(size = 13),
plot.title = element_text(size = 20, face = 'bold'))
The graph shows, in 2017, there has been a positive trend line in number of New Customers [Customer Unique Identity] getting registered with Olist. In 2018, more than 6000 were getting registered every month.
Orders1 <- mutate(Orders, Est.Lead.t = interval(ymd_hms(Orders$order_purchase_timestamp), ymd_hms(Orders$order_estimated_delivery_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Act.Lead.t = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_delivered_customer_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Approval.mins = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_approved_at)) %/% minutes(x = 1))
Orders1 <- mutate(Orders1, Appr.to.Car = interval(ymd_hms(Orders1$order_purchase_timestamp), ymd_hms(Orders$order_delivered_carrier_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, Car.to.Cus = interval(ymd_hms(Orders1$order_delivered_carrier_date), ymd_hms(Orders$order_delivered_customer_date)) %/% days(x = 1))
Orders1 <- mutate(Orders1, purchase_quart = quarter(Orders1$order_purchase_timestamp, with_year = TRUE))
Orders1 <- mutate(Orders1, purchase_month = month(Orders1$order_purchase_timestamp, label = TRUE, abbr = FALSE))
Orders1 <- mutate(Orders1, purchase_year = year(Orders1$order_purchase_timestamp))
Items1 <- mutate(Items, order.price = (order_item_id*price))
Items1 <- mutate(Items1, order.freight = (order_item_id*freight_value))
Items1 <- mutate(Items1, total.order.cost = (order.price + order.freight))
Items1 <- mutate(Items1, fr.sup.pr = if_else(freight_value >= price, 1, 0))
Cust.gr <- inner_join(Orders1, customer, by = "customer_id")
Cust.gr %>% select_at(vars(purchase_year, purchase_month, customer_unique_id)) %>% distinct_at(vars(purchase_year, purchase_month, customer_unique_id)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(y = n, x = as.factor(purchase_month), label = n, color = factor(purchase_year)))+
geom_point(size = 5, color = 'navy', shape = 21, alpha = 0.54)+
geom_point(size = 3.75, color = 'navy', shape = 21, alpha = 0.72)+
geom_label(aes(fill = factor(purchase_year)), colour = "white", fontface = "bold", vjust = 1, size = 3.25)+
facet_grid(.~purchase_year)+
theme_minimal()+
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = c(0.1,0.7))+
labs(title = "Trend of new customer registration per month", x = "Months", y = "Number of Customers", fill = "Purchase Year")
## Warning: `count_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `count()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the dplyr package.
## Please report the issue at <]8;;https://github.com/tidyverse/dplyr/issueshttps://github.com/tidyverse/dplyr/issues]8;;>.
The diagram below illustrates that, if addition 50 New Product Categories per month,this will see a trend of customer growth. Consistency in this phenomenon was Olist’s byword for 20 months i.e. from January 2017 to August 2018.
Products <- left_join(products, nametrans, by = "product_category_name")
Items1 <- left_join(Items1, Products, by = "product_id")
Pdt.Gr <- inner_join(Orders1, Items1, by = "order_id")
Pdt.Gr %>% select_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% distinct_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(x = factor(purchase_month), y = n))+
geom_point(color = "darkorchid", alpha = 0.72, shape = 21, aes(size = n), fill = alpha("snow2", 0.3))+
geom_point(color = "darkorchid", alpha = 0.81, shape = 21, aes(size = n*0.6), fill = alpha("snow2", 0.3))+
geom_segment(aes(x = purchase_month, xend = purchase_month, y = 0, yend = n), alpha = 0.45, linetype = 3, color = "darkorchid")+
coord_polar(start = 2)+
facet_wrap(.~purchase_year)+
theme_minimal()+
theme(axis.text = element_text(size = 8), panel.grid.major = element_line(), legend.position = "bottom")+
labs(title = "New Product categories introduced per month", x = "Months", y = "Number of Product Categories")
Analysis of the graph below shows that sellers too maintained the similar trend as that of customers, and within the time frame of this data, Olist ended up having +3095 registered sellers on it’s platform.
Seller.Gr <- inner_join(Orders1, left_join(Items1, sellers, by = "seller_id"), by = "order_id")
Seller.Gr %>% select_at(vars(purchase_year, purchase_month, seller_id)) %>% distinct_at(vars(purchase_year, purchase_month, seller_id)) %>% arrange_at(vars(purchase_year, purchase_month)) %>% count_(vars(purchase_year, purchase_month)) %>% ggplot(aes(x = factor(purchase_month), y = n))+
geom_point(color = "red", alpha = 0.63, shape = 21, aes(size = n), fill = alpha("cornsilk", 0.3))+
geom_point(color = "red", alpha = 0.81, shape = 21, aes(size = n*0.6), fill = alpha("cornsilk", 0.3))+
geom_segment(aes(x = purchase_month, xend = purchase_month, y = 0, yend = n), alpha = 0.45, linetype = 3, color = "red")+
coord_polar(start = 2)+
facet_wrap(.~purchase_year)+
theme_minimal()+
theme(axis.text = element_text(size = 8), panel.grid.major = element_line(), legend.position = "bottom")+
labs(title = "Increase in number of sellers per month", x = "Months", y = "Number of Sellers")
First, add the English category name into the product dataframe. And multiply the columns of length, width, and height as product volume.
index_no_translation <- which(!(products$product_category_name %in% nametrans$original_name) & products$product_category_name!='')
category_no_translation <- unique(products$product_category_name[index_no_translation])
nametrans[nrow(nametrans)+1, ] <- c('pc_gamer', 'pc_gamer')
nametrans[nrow(nametrans)+1, ] <- c('portateis_cozinha_e_preparadores_de_alimentos', 'portable_kitchen_and_food_preparation')
category <- sqldf("
SELECT df.product_category_name AS original_name,
nametrans.product_category_name_english,
df.product_weight_g AS weight,
product_length_cm*product_height_cm*product_width_cm AS volume
FROM df
LEFT JOIN nametrans
ON df.product_category_name = nametrans.product_category_name
WHERE df.product_category_name != ''
")
In the following, list the statestic tables people may be curious, such as which one category is the most popular in the Olist platform and which is most heaviest or largest.
most5 <- sqldf("
SELECT product_category_name_english,
COUNT(product_category_name_english) AS counts
FROM category
GROUP BY product_category_name_english
ORDER BY counts DESC
LIMIT 5
")
most5
## product_category_name_english counts
## 1 bed_bath_table 11847
## 2 health_beauty 9944
## 3 sports_leisure 8942
## 4 furniture_decor 8743
## 5 computers_accessories 8105
least5 <- sqldf("
SELECT product_category_name_english,
COUNT(product_category_name_english) AS counts
FROM category
GROUP BY product_category_name_english
ORDER BY counts
LIMIT 5
")
least5
## product_category_name_english counts
## 1 security_and_services 2
## 2 fashion_childrens_clothes 8
## 3 pc_gamer 10
## 4 cds_dvds_musicals 14
## 5 la_cuisine 15
heavy5 <- sqldf("
SELECT product_category_name_english,
AVG(weight) AS weight_group_mean
FROM category
GROUP BY product_category_name_english
ORDER BY weight_group_mean DESC
LIMIT 5
")
heavy5
## product_category_name_english weight_group_mean
## 1 office_furniture 11281.065
## 2 furniture_bedroom 10124.405
## 3 home_appliances_2 9888.052
## 4 kitchen_dining_laundry_garden_furniture 8670.963
## 5 furniture_mattress_and_upholstery 8446.341
light5 <- sqldf("
SELECT product_category_name_english,
AVG(weight) AS weight_group_mean
FROM category
GROUP BY product_category_name_english
ORDER BY weight_group_mean
LIMIT 5
")
light5
## product_category_name_english weight_group_mean
## 1 telephony 262.0009
## 2 fashion_childrens_clothes 265.0000
## 3 fashion_underwear_beach 269.7552
## 4 tablets_printing_image 292.4118
## 5 fashion_sport 335.4839
large5 <- sqldf("
SELECT product_category_name_english,
AVG(volume) AS volume_group_mean
FROM category
GROUP BY product_category_name_english
ORDER BY volume_group_mean DESC
LIMIT 5
")
large5
## product_category_name_english volume_group_mean
## 1 office_furniture 73454.07
## 2 furniture_mattress_and_upholstery 68050.59
## 3 home_appliances_2 53813.25
## 4 kitchen_dining_laundry_garden_furniture 51475.60
## 5 furniture_bedroom 47393.85
small5 <- sqldf("
SELECT product_category_name_english,
AVG(volume) AS volume_group_mean
FROM category
GROUP BY product_category_name_english
ORDER BY volume_group_mean
LIMIT 5
")
small5
## product_category_name_english volume_group_mean
## 1 telephony 1803.015
## 2 dvds_blu_ray 1889.743
## 3 books_imported 2076.500
## 4 watches_gifts 2834.818
## 5 books_technical 3174.119
Analysis by results, all Product Categories were purchased multiple times. The graph below shows which categories are high frequency categories.
Pdt.Gr %>% group_by_at(vars(purchase_year, purchase_month, product_category_name_english)) %>% filter(!is.na(product_category_name_english))%>% count(product_category_name_english) %>% arrange(desc(n)) %>% filter(!is.na(product_category_name_english) & n >1) %>% ggplot(aes(x = product_category_name_english, y = n))+
geom_boxplot(color = "blueviolet", alpha = 0.63)+
coord_flip()+
facet_wrap(.~reorder(purchase_year, desc(purchase_year)), scales = "free_x")+
theme_minimal()+
scale_y_reverse()+
theme(axis.text.y = element_text(size = 7.25), panel.grid.major.x = element_line(color = "white"), panel.grid.minor.x = element_line(color = "white"), panel.grid.major.y = element_line(color = "grey95", linetype = 3))+
labs(title = "Categories purchased at high frequency per year", x = "Product Categories", y = "Number of times Product Category purchased")
We would like to predict if an order will be delivered or not (classification problem) using Random Forest and predict the freight value(Regression problem) using linear regression
Here we will do data prepreocessing such as removing missing values , one hot encoding and choosing target variables
table(df$order_status)
##
## approved canceled delivered invoiced processing shipped
## 3 538 113232 358 357 1138
## unavailable
## 7
# target label preprocessing
df[which(df$order_status!='delivered'),]$order_status<-1
df[which(df$order_status=='delivered'),]$order_status<-2
df3<-select(df, -c('customer_id', 'customer_unique_id','order_id','order_item_id', 'product_id',
'seller_id','review_id','review_comment_title','review_comment_message','customer_city','customer_state',
'seller_city','seller_state','payment_type','order_purchase_timestamp',
'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date',
'order_estimated_delivery_date','review_creation_date',
'review_answer_timestamp','shipping_limit_date','product_category_name' ))
#back up
backup<- df
df<-backup
# one-hot encoding will conver categorical columns into numnerical
library('caret')
df_dummy<-select(df, c('customer_state', 'order_status', 'product_category_name', 'seller_city', 'seller_state', 'payment_type'))
dum<-dummyVars( ~ customer_state, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)
dum<-dummyVars( ~ product_category_name, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)
unique(df_dummy$seller_state)
## [1] "SP" "SC" "RJ" "PR" "MG" "ES" "RS" "DF" "PE" "MA" "BA" "GO" "MT" "MS" "RO"
## [16] "PB" "CE" "PA" "RN" "PI" "SE" "AC" "AM"
dum<-dummyVars( ~ seller_state, data=df_dummy)
processed<-data.frame(predict(dum, newdata=df_dummy))
df_dummy<-cbind(df_dummy, processed)
# df3 contains many null data from Date attributes, and most of them are label 1. So we cannot drop them
df3[is.na(df3)]<-0
# drop unnecessary attributeslst()# drop unnecessary attributes
df_dummy<-select(df_dummy, -c('customer_state', 'product_category_name', 'seller_city', 'seller_state', 'payment_type'))
lst<-cbind(df3, df_dummy)
Here we will perform correlational analysis . We would like to identify attributes that has impact on the target variable ‘order_status’
# drop missing value
na_smry<-summary(is.na(lst))
lst<-drop_na(lst, names(lst))
#
data_cor <-cor(lst[ , colnames(lst) !="order_status"], as.integer(lst$order_status))
x=data_cor >= abs(0.05)
sum(x, na.rm=TRUE)
## [1] 7
which(data_cor >= abs(0.05))
## [1] 15 22 23 24 25 26 27
# show all the correlated variables which is higher than 0.05
x[which(x==TRUE),]
## review_score year_order_delivered_carrier_date
## TRUE TRUE
## month_order_delivered_carrier_date day_order_delivered_carrier_date
## TRUE TRUE
## year_order_delivered_customer_date month_order_delivered_customer_date
## TRUE TRUE
## day_order_delivered_customer_date
## TRUE
# select all the correlated data
df4= select(df3,c('review_score','year_order_delivered_carrier_date', 'month_order_delivered_carrier_date',
'day_order_delivered_carrier_date', 'year_order_delivered_customer_date','month_order_delivered_customer_date',
'day_order_delivered_customer_date','order_status'))
# convert the target variable into factor
df4$order_status<- as.factor(df4$order_status)
here we will import the Random forest which will help us solve the classification problem ‘how to predit if an order will be delivered or not?’ we split the dataset into training and testing and fit the model into the dataset
# load those libraries
library(stats)
library(dplyr)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caTools)
# Splitting data in train and test data
split <- sample.split(df4, SplitRatio = 0.7)
train <- subset(df4, split == "TRUE")
test <- subset(df4, split == "FALSE")
# Fitting Random Forest to the train dataset
set.seed(120) # Setting seed
classifier_RF = randomForest(x = train[-8],
y = train$order_status,
ntree = 100)
The model has performed very well . The accuracy is very high 99.99% .This is because the dataset is non linear and RF performs well in such dataset
# Evaluate the model
status_pred = predict(classifier_RF, test) # will use RFM to predict Testing
test$status_pred = status_pred
# building the confusion matrix
CFM = table(test$order_status,test$status_pred)
CFM
##
## 1 2
## 1 947 4
## 2 5 42406
# show the accuracy for the classification
classification_Accuracy = sum(diag(CFM)/sum(CFM))
classification_Accuracy
## [1] 0.9997924
here we will predict the freight value of an order .We will use linear Regression to solve this problem . First we will perform correlational analysis to see the attributes that impact the target variable Freight value
#### 5.2.0 Correlational Analysis
Here we will perform correlational analysis . We would like to identify attributes that has impact on the target variable ‘freight_value’
# calculate the correlated attributes with freight value.
# freight value is the next target for the regression.
lst<-lst[ , colnames(lst) != "order_status"]
data_cor <-cor(lst, lst$freight_value)
x=data_cor >= abs(0.05)
sum(x, na.rm=TRUE)
## [1] 35
which(data_cor >= abs(0.05))
## [1] 1 2 3 5 7 8 9 10 11 13 14 41 44 45 49 52 53 54 55
## [20] 56 59 60 64 66 96 120 122 123 124 128 142 148 155 159 160
# show all the correlated attributes whihc is higher than 0.05
x[which(x==TRUE),]
## customer_zip_code_prefix
## TRUE
## price
## TRUE
## freight_value
## TRUE
## product_description_lenght
## TRUE
## product_weight_g
## TRUE
## product_length_cm
## TRUE
## product_height_cm
## TRUE
## product_width_cm
## TRUE
## seller_zip_code_prefix
## TRUE
## payment_installments
## TRUE
## payment_value
## TRUE
## customer_stateAL
## TRUE
## customer_stateBA
## TRUE
## customer_stateCE
## TRUE
## customer_stateMA
## TRUE
## customer_stateMT
## TRUE
## customer_statePA
## TRUE
## customer_statePB
## TRUE
## customer_statePE
## TRUE
## customer_statePI
## TRUE
## customer_stateRN
## TRUE
## customer_stateRO
## TRUE
## customer_stateSE
## TRUE
## customer_stateTO
## TRUE
## product_category_nameeletrodomesticos_2
## TRUE
## product_category_namemoveis_cozinha_area_de_servico_jantar_e_jardim
## TRUE
## product_category_namemoveis_escritorio
## TRUE
## product_category_namemoveis_quarto
## TRUE
## product_category_namemoveis_sala
## TRUE
## product_category_namepcs
## TRUE
## seller_stateBA
## TRUE
## seller_stateMG
## TRUE
## seller_statePR
## TRUE
## seller_stateRS
## TRUE
## seller_stateSC
## TRUE
# extract all the correlated attributes to generate a new data frame
df5<-lst[ , which(data_cor >= abs(0.05))]
here we will import the model , split the dataset and fit the model into the dataset
# fix a regression model and see it's R-Square to assess
# Splitting data in train and test data
split <- sample.split(df5, SplitRatio = 0.7)
train <- subset(df5, split == "TRUE")
test <- subset(df5, split == "FALSE")
set.seed(120) # Setting seed
lm_model<-lm(freight_value ~.,data=train)
summary(lm_model)$adj.r.squared
## [1] 0.5551996
Linear Regression has performed poorly on this dataset( MSE is 111 and Adjusted R squared = 0.54). This is because the dataset is non linear
# calculate the Mean Square Error of the prediction performance and the model itself
test$pred<-predict(lm_model, newdata=test)
MSE<-mean((test$freight_value - test$pred)^2)
print(MSE)
## [1] 116.9281
print(mean(lm_model$residuals^2))
## [1] 110.8917