library(dplyr) # for pipe operator
library(ggplot2) # for display of plots
library(readr) # to import datasets
library(lattice) # to display graphs
library(lubridate) # for operations on date and time variables
library(tidyr) # used to tidy data
library(forecast) # used for Box-Cox transformation
library(validate) # to use functions on dataframes
library(outliers)
First, the required packages are installed and loaded to use required function and operators for data pre-processing. Then the two datasets of intrigued are imported into R workspace to perform required operations.
Before carrying out the pre-processing task, the left join condition is used to merge the two datasets in order to gather the details of only those orders which has delivery status.
The rundown of combined dataset is displayed which shows statistical and general information of all the variables.
The desired variables are changed over into reasonable data type such as factor as required
Post datatype conversions, the NA values are determined and omitted to carry out necessary operations.
The dataset is in a UNTIDY format. The date and time are in the same variable,So we separated it.
Another new variable named Total_valueTopay is mutated to display the total cost of the product including delivey charges.
The outliers of the 3 numeric variables, price, freight_value and Total_valueTopay are checked by using boxplot. The outliers recognized are managed using the CAPPING method via a user-defined function which caps the values above 95percentile of the data.
We need to NORMALIZE the histogram displayed after removing the outliers which indicates the data is right skewed.
Finally, to reduce the skewness of the price and Total_price variables, data transformation using SQUARE ROOT METHOD is used.
The dataset named “Brazilian E-Commerce Public Dataset by Olist”, has over 100,000 Orders with reviews info,product and customer.
Amongst these whole dataset We have only considered 2 files as desired by the assignment requirements which are olist_orders_dataset and olist_order_item_dataset.
These two data files have the variables as Order_id, product_id and seller_id which are alphanumeric, order_item_id, price and freight_value which are numeric, order_status is character but will be converted into factor, order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date, and shipping_limit_date are date time variables.
library(readr)
dataset_1 <- read_csv("olist_order_items_dataset.csv")
Parsed with column specification:
cols(
order_id = [31mcol_character()[39m,
order_item_id = [32mcol_double()[39m,
product_id = [31mcol_character()[39m,
seller_id = [31mcol_character()[39m,
shipping_limit_date = [34mcol_datetime(format = "")[39m,
price = [32mcol_double()[39m,
freight_value = [32mcol_double()[39m
)
View(dataset_1)
library(readr)
dataset_2 <- read_csv("olist_orders_dataset.csv")
Parsed with column specification:
cols(
order_id = [31mcol_character()[39m,
customer_id = [31mcol_character()[39m,
order_status = [31mcol_character()[39m,
order_purchase_timestamp = [34mcol_datetime(format = "")[39m,
order_approved_at = [34mcol_datetime(format = "")[39m,
order_delivered_carrier_date = [34mcol_datetime(format = "")[39m,
order_delivered_customer_date = [34mcol_datetime(format = "")[39m,
order_estimated_delivery_date = [34mcol_datetime(format = "")[39m
)
View(dataset_2)
head(dataset_1)
NA
NA
head(dataset_2)
NA
NA
joined_data <- left_join(dataset_2, dataset_1, by = "order_id")
head(joined_data, 10)
NA
summary(joined_data)
order_id customer_id order_status order_purchase_timestamp
Length:113425 Length:113425 Length:113425 Min. :2016-09-04 21:15:19
Class :character Class :character Class :character 1st Qu.:2017-09-13 11:05:49
Mode :character Mode :character Mode :character Median :2018-01-19 10:37:45
Mean :2017-12-31 12:25:41
3rd Qu.:2018-05-04 14:22:16
Max. :2018-10-17 17:30:18
order_approved_at order_delivered_carrier_date order_delivered_customer_date
Min. :2016-09-15 12:16:38 Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-13 20:25:10 1st Qu.:2017-09-18 20:37:00 1st Qu.:2017-09-26 20:09:44
Median :2018-01-19 16:59:52 Median :2018-01-24 18:44:33 Median :2018-02-02 20:57:23
Mean :2017-12-31 22:40:56 Mean :2018-01-05 02:17:27 Mean :2018-01-14 13:25:24
3rd Qu.:2018-05-04 18:31:38 3rd Qu.:2018-05-08 13:20:00 3rd Qu.:2018-05-15 20:09:21
Max. :2018-09-03 17:40:06 Max. :2018-09-11 19:48:28 Max. :2018-10-17 13:22:46
NA's :161 NA's :1968 NA's :3229
order_estimated_delivery_date order_item_id product_id seller_id
Min. :2016-09-30 00:00:00 Min. : 1.000 Length:113425 Length:113425
1st Qu.:2017-10-04 00:00:00 1st Qu.: 1.000 Class :character Class :character
Median :2018-02-15 00:00:00 Median : 1.000 Mode :character Mode :character
Mean :2018-01-24 08:22:34 Mean : 1.198
3rd Qu.:2018-05-25 00:00:00 3rd Qu.: 1.000
Max. :2018-11-12 00:00:00 Max. :21.000
NA's :775
shipping_limit_date price freight_value
Min. :2016-09-19 00:15:34 Min. : 0.85 Min. : 0.00
1st Qu.:2017-09-20 20:57:27 1st Qu.: 39.90 1st Qu.: 13.08
Median :2018-01-26 13:59:35 Median : 74.99 Median : 16.26
Mean :2018-01-07 15:36:52 Mean : 120.65 Mean : 19.99
3rd Qu.:2018-05-10 14:34:00 3rd Qu.: 134.90 3rd Qu.: 21.15
Max. :2020-04-09 22:35:08 Max. :6735.00 Max. :409.68
NA's :775 NA's :775 NA's :775
joined_data$order_status <- joined_data$order_status %>% factor(levels = c("unavailable", "created", "cancelled", "approved", "processing","invoiced", "shipped", "delivered"), ordered = TRUE)
class(joined_data$order_status) # Checking class of variable after conversion
[1] "ordered" "factor"
levels(joined_data$order_status) # levelling order_status variable
[1] "unavailable" "created" "cancelled" "approved" "processing" "invoiced" "shipped"
[8] "delivered"
str(joined_data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 113425 obs. of 14 variables:
$ order_id : chr "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
$ customer_id : chr "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
$ order_status : Ord.factor w/ 8 levels "unavailable"<..: 8 8 8 8 8 8 6 8 8 8 ...
$ order_purchase_timestamp : POSIXct, format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" "2018-08-08 08:38:49" ...
$ order_approved_at : POSIXct, format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" "2018-08-08 08:55:23" ...
$ order_delivered_carrier_date : POSIXct, format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" "2018-08-08 13:50:00" ...
$ order_delivered_customer_date: POSIXct, format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" "2018-08-17 18:06:29" ...
$ order_estimated_delivery_date: POSIXct, format: "2017-10-18" "2018-08-13" "2018-09-04" ...
$ order_item_id : num 1 1 1 1 1 1 1 1 1 1 ...
$ product_id : chr "87285b34884572647811a353c7ac498a" "595fac2a385ac33a80bd5114aec74eb8" "aa4383b373c6aca5d8797843e5594415" "d0b61bfb1de832b15ba9d266ca96e5b0" ...
$ seller_id : chr "3504c0cb71d7fa48d967e0e4c94d59d9" "289cdb325fb7e7f891c38608bf9e0962" "4869f7a5dfa277a7dca6462dcf3b52b2" "66922902710d126a0e7d26b0e3805106" ...
$ shipping_limit_date : POSIXct, format: "2017-10-06 11:07:15" "2018-07-30 03:24:27" "2018-08-13 08:55:23" ...
$ price : num 30 118.7 159.9 45 19.9 ...
$ freight_value : num 8.72 22.76 19.22 27.2 8.72 ...
tidy_data <- joined_data %>% separate (order_purchase_timestamp, into = c("purchase_Date", "purchase_Time"), sep = " ")
tidy_data <- tidy_data %>% separate (order_approved_at, into = c("approved_Date", "approved_Time"), sep = " ")
tidy_data <- tidy_data %>% separate (order_delivered_carrier_date, into = c("delivered_carrier_Date", "delivered_carrier_Time"), sep = " ")
tidy_data <- tidy_data %>% separate (order_delivered_customer_date, into = c("delivered_customer_Date", "delivered_customer_Time"), sep = " ")
tidy_data <- tidy_data %>% separate (shipping_limit_date, into = c("shipping_limit_date", "shipping_limit_Time"), sep = " ")
select(tidy_data,purchase_Date,purchase_Time,approved_Date,approved_Time,delivered_carrier_Date,delivered_carrier_Time,delivered_customer_Date,delivered_customer_Time,shipping_limit_date,shipping_limit_Time) %>% head()
NA
NA
mutated_data <- mutate(tidy_data, Total_valueTopay =(price) + (freight_value))
select(mutated_data,order_id,price,freight_value,Total_valueTopay) %>% head()
NA
NA
The missing values of dataset has a total of 17622.
we will remove these NA values.
colSums(is.na(mutated_data))
order_id customer_id order_status
0 0 706
purchase_Date purchase_Time approved_Date
0 0 161
approved_Time delivered_carrier_Date delivered_carrier_Time
161 1968 1968
delivered_customer_Date delivered_customer_Time order_estimated_delivery_date
3229 3229 0
order_item_id product_id seller_id
775 775 775
shipping_limit_date shipping_limit_Time price
775 775 775
freight_value Total_valueTopay
775 775
sum(is.na(mutated_data))
[1] 17622
scanned_data1 <- na.omit(mutated_data) # removing NA values and keeping only rows with complete cases
colSums(is.na(scanned_data1)) # Checking for removed NA values
order_id customer_id order_status
0 0 0
purchase_Date purchase_Time approved_Date
0 0 0
approved_Time delivered_carrier_Date delivered_carrier_Time
0 0 0
delivered_customer_Date delivered_customer_Time order_estimated_delivery_date
0 0 0
order_item_id product_id seller_id
0 0 0
shipping_limit_date shipping_limit_Time price
0 0 0
freight_value Total_valueTopay
0 0
sum(is.na(scanned_data1))
[1] 0
scanned_data1$freight_value %>% boxplot( main="Box Plot of Price", ylab="freight_value", col = "grey")
scanned_data1$Total_valueTopay %>% boxplot( main="Box Plot of Total_valueTopay", ylab="Price", col = "grey")
scanned_data1$price %>% boxplot( main="Box Plot of Price", ylab="Price", col = "grey")
scanned_data1$freight_value %>% boxplot( main="Box Plot of Price", ylab="freight_value", col = "grey")
scanned_data1$Total_valueTopay %>% boxplot( main="Box Plot of Total_valueTopay", ylab="Price", col = "grey")
NA
NA
NA
z.scores_price <- scanned_data1$price %>% scores(type = "z")
z.scores_price %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.65342 -0.43923 -0.24726 0.00000 0.07784 36.28366
z.scores_freight <- scanned_data1$freight_value %>% scores(type = "z")
z.scores_freight %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.2707 -0.4375 -0.2350 0.0000 0.0765 24.8244
z.scores_total <- scanned_data1$Total_valueTopay %>% scores(type = "z")
z.scores_total %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.70694 -0.44766 -0.25250 0.00000 0.09265 35.85921
length (which( abs(z.scores_price) >3 ))
[1] 1930
length (which( abs(z.scores_freight) >3 ))
[1] 2004
length (which( abs(z.scores_total) >3 ))
[1] 1931
cap<- function (x){ ## Capping the dataset to handle outliers
quantiles <- quantile(x,c(.05, 0.25, 0.75, 0.95))
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[x > quantiles[3] + 1.5*IQR(x) ] <-quantiles[4]
x
}
new_price<- scanned_data1$price %>% cap()
new_freight_value<- scanned_data1$freight_value %>% cap()
Total_price<- scanned_data1$Total_valueTopay %>% cap()
boxplot(new_price, new_freight_value, Total_price, main = "Multiple boxplots without outliers", at = c(1,2,3),
names = c("price", "freight", "total"), col ="orange", border = "black")
*The new_price and total price histogram is plotted.
hist(new_price, xlab = "Price", main="Histogram after filtering Price <= 500")
grid()
hist(Total_price, xlab = "Total_price", main="Histogram after filtering Total_price <= 500")
grid()
NA
NA
squareroottransform <- sqrt(new_price)
hist(squareroottransform, xlab = "Square Root (Price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")
squareroottransform <- sqrt(Total_price)
hist(squareroottransform, xlab = "Square Root (Total_price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")