Importing related libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## <U+221A> ggplot2 3.3.2     <U+221A> purrr   0.3.4
## <U+221A> tibble  3.0.4     <U+221A> stringr 1.4.0
## <U+221A> tidyr   1.1.2     <U+221A> forcats 0.5.0
## <U+221A> readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Importing dataset

olist_customers_dataset = read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_customers_dataset.csv")
olist_geolocation_dataset = read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_geolocation_dataset.csv")
olist_order_items_dataset = read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_order_items_dataset.csv")
olist_order_payments_dataset= read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_order_payments_dataset.csv")
olist_order_reviews_dataset= read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_order_reviews_dataset.csv")
olist_orders_dataset=read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_orders_dataset.csv")
olist_products_dataset=read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_products_dataset.csv")
olist_sellers_dataset= read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/olist_sellers_dataset.csv")
product_category_name_translation = read.csv("C:/Users/Admin/Downloads/brazilan e-commerce/product_category_name_translation.csv")

Deleting Null values from ‘order_approved_at’ & ’order_delivered_carried_date’columns

olist_orders_dataset=subset( olist_orders_dataset,order_approved_at>''& order_delivered_carrier_date>'')

Extracting orders with invalid date: These orders have illogical date which the delivered customer date is more than the order delivered carrier date that means the customer received the shipment before it has been shipped!

wrong_shipping_data=olist_orders_dataset[olist_orders_dataset$order_delivered_customer_date<olist_orders_dataset$order_delivered_carrier_date 
                                         & olist_orders_dataset$order_delivered_customer_date>"",]

Exploring the wrong orders

View(wrong_shipping_data)

The dataset after wrong shipping date being removed

olist_orders_dataset_corrected= (olist_orders_dataset
                                 [!(olist_orders_dataset$order_id) %in% 
                                     (wrong_shipping_data$order_id),])

Exploring the dataset

View(olist_orders_dataset_corrected)

Excluding Null values from the ‘delivered_customer_date’ column in order to measure the median of the shipping time to fill them into null values again

olist_orders_dataset_without_null= olist_orders_dataset_corrected[olist_orders_dataset_corrected$order_delivered_customer_date>'',]
df=olist_orders_dataset_without_null

Calculating the median of the shipping time in Seconds

diffirent_time_in_second_after_correction=
  median(as.POSIXct(df$order_delivered_customer_date)-
           as.POSIXct(df$order_delivered_carrier_date))
diffirent_time_in_second_after_correction
## Time difference of 613453.5 secs

Calculating the median of the shipping time in Minutes

m_after_correction=diffirent_time_in_second_after_correction/60

Calculating the median of the shipping time in Minutes with no string

m_after_correction=unlist(m_after_correction,'\\s+')[[1]]
m_after_correction
## [1] 10224.23

Calculating the median of the shipping time in Hours

h_after_correction=m_after_correction/60
h_after_correction
## [1] 170.4038

Calculating the median of the shipping time in Days to use it in replacing Null values at the column

d_after_correction=(round(h_after_correction/24))
d_after_correction
## [1] 7

Finding the maximum shipping time in Seconds

max_time_after_correction=max(as.POSIXct(df$order_delivered_customer_date)-
                                as.POSIXct(df$order_delivered_carrier_date))
max_time_after_correction
## Time difference of 17728500 secs

Finding the minimum shipping time in Seconds, this should be (0), otherwise it is a wrong value

min_time_after_correction=min(as.POSIXct(df$order_delivered_customer_date)-
                                as.POSIXct(df$order_delivered_carrier_date))
min_time_after_correction
## Time difference of 0 secs

Filling the Null values in the ‘order_delivered_customer_date’ column with the suitable value, the method is that we measured the median of the shipping time value and then add it to the shipping date and time to anticipate the receiving date and time

for (i in 1:length(olist_orders_dataset_corrected$order_id)){
  if (olist_orders_dataset_corrected[i,7]==''){
    olist_orders_dataset_corrected[i,7]= 
      as.character.Date(ymd_hms(olist_orders_dataset_corrected[i,6])+days(d_after_correction))
  }
}

Exploring the final cleaned dataset

View(olist_orders_dataset_corrected)

#Fixing olist_order_reviews_dataset

In this dataset we have numbers of Null values in both ‘review_comment_title’ and ‘review_comment_message’ columns

Filling Null values in both columns

olist_order_reviews_dataset$
  review_comment_title[olist_order_reviews_dataset$review_comment_title=='' 
                       | olist_order_reviews_dataset$review_comment_title==' ' ]= 'comment without title'
olist_order_reviews_dataset$
  review_comment_message[olist_order_reviews_dataset$review_comment_message==''
                         | olist_order_reviews_dataset$review_comment_message==' ' ]='comment without message'

Exploring the dataset

View(olist_order_reviews_dataset)

# Fixing olist_products_dataset

olist_products_dataset=na.omit(olist_products_dataset)

Exploring the dataset

view(olist_products_dataset)