Required packages

library(dplyr)     # for pipe operator
library(ggplot2)   # for display of plots
library(readr)     # to import datasets
library(lattice)   # to display graphs
library(lubridate) # for operations on date and time variables
library(tidyr)     # used to tidy data
library(forecast)  # used for Box-Cox transformation
library(validate)  # to use functions on dataframes
library(outliers)

Executive Summary

Data

Importing and Reading the data set

library(readr)
dataset_1 <- read_csv("olist_order_items_dataset.csv")
Parsed with column specification:
cols(
  order_id = col_character(),
  order_item_id = col_double(),
  product_id = col_character(),
  seller_id = col_character(),
  shipping_limit_date = col_datetime(format = ""),
  price = col_double(),
  freight_value = col_double()
)
View(dataset_1)

library(readr)
dataset_2 <- read_csv("olist_orders_dataset.csv")
Parsed with column specification:
cols(
  order_id = col_character(),
  customer_id = col_character(),
  order_status = col_character(),
  order_purchase_timestamp = col_datetime(format = ""),
  order_approved_at = col_datetime(format = ""),
  order_delivered_carrier_date = col_datetime(format = ""),
  order_delivered_customer_date = col_datetime(format = ""),
  order_estimated_delivery_date = col_datetime(format = "")
)
View(dataset_2)

Order Items Dataset


head(dataset_1)
NA
NA

Overall Orders Dataset


head(dataset_2)
NA
NA

Joining Data set


joined_data <- left_join(dataset_2, dataset_1, by = "order_id")  
head(joined_data, 10)
NA

Understand

summary(joined_data)
   order_id         customer_id        order_status       order_purchase_timestamp     
 Length:113425      Length:113425      Length:113425      Min.   :2016-09-04 21:15:19  
 Class :character   Class :character   Class :character   1st Qu.:2017-09-13 11:05:49  
 Mode  :character   Mode  :character   Mode  :character   Median :2018-01-19 10:37:45  
                                                          Mean   :2017-12-31 12:25:41  
                                                          3rd Qu.:2018-05-04 14:22:16  
                                                          Max.   :2018-10-17 17:30:18  
                                                                                       
 order_approved_at             order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-09-15 12:16:38   Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-13 20:25:10   1st Qu.:2017-09-18 20:37:00   1st Qu.:2017-09-26 20:09:44  
 Median :2018-01-19 16:59:52   Median :2018-01-24 18:44:33   Median :2018-02-02 20:57:23  
 Mean   :2017-12-31 22:40:56   Mean   :2018-01-05 02:17:27   Mean   :2018-01-14 13:25:24  
 3rd Qu.:2018-05-04 18:31:38   3rd Qu.:2018-05-08 13:20:00   3rd Qu.:2018-05-15 20:09:21  
 Max.   :2018-09-03 17:40:06   Max.   :2018-09-11 19:48:28   Max.   :2018-10-17 13:22:46  
 NA's   :161                   NA's   :1968                  NA's   :3229                 
 order_estimated_delivery_date order_item_id     product_id         seller_id        
 Min.   :2016-09-30 00:00:00   Min.   : 1.000   Length:113425      Length:113425     
 1st Qu.:2017-10-04 00:00:00   1st Qu.: 1.000   Class :character   Class :character  
 Median :2018-02-15 00:00:00   Median : 1.000   Mode  :character   Mode  :character  
 Mean   :2018-01-24 08:22:34   Mean   : 1.198                                        
 3rd Qu.:2018-05-25 00:00:00   3rd Qu.: 1.000                                        
 Max.   :2018-11-12 00:00:00   Max.   :21.000                                        
                               NA's   :775                                           
 shipping_limit_date               price         freight_value   
 Min.   :2016-09-19 00:15:34   Min.   :   0.85   Min.   :  0.00  
 1st Qu.:2017-09-20 20:57:27   1st Qu.:  39.90   1st Qu.: 13.08  
 Median :2018-01-26 13:59:35   Median :  74.99   Median : 16.26  
 Mean   :2018-01-07 15:36:52   Mean   : 120.65   Mean   : 19.99  
 3rd Qu.:2018-05-10 14:34:00   3rd Qu.: 134.90   3rd Qu.: 21.15  
 Max.   :2020-04-09 22:35:08   Max.   :6735.00   Max.   :409.68  
 NA's   :775                   NA's   :775       NA's   :775     
joined_data$order_status <- joined_data$order_status %>% factor(levels = c("unavailable", "created", "cancelled", "approved", "processing","invoiced", "shipped", "delivered"), ordered = TRUE) 

class(joined_data$order_status) # Checking class of variable after conversion
[1] "ordered" "factor" 
levels(joined_data$order_status) # levelling order_status variable
[1] "unavailable" "created"     "cancelled"   "approved"    "processing"  "invoiced"    "shipped"    
[8] "delivered"  

str(joined_data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':    113425 obs. of  14 variables:
 $ order_id                     : chr  "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
 $ customer_id                  : chr  "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
 $ order_status                 : Ord.factor w/ 8 levels "unavailable"<..: 8 8 8 8 8 8 6 8 8 8 ...
 $ order_purchase_timestamp     : POSIXct, format: "2017-10-02 10:56:33" "2018-07-24 20:41:37" "2018-08-08 08:38:49" ...
 $ order_approved_at            : POSIXct, format: "2017-10-02 11:07:15" "2018-07-26 03:24:27" "2018-08-08 08:55:23" ...
 $ order_delivered_carrier_date : POSIXct, format: "2017-10-04 19:55:00" "2018-07-26 14:31:00" "2018-08-08 13:50:00" ...
 $ order_delivered_customer_date: POSIXct, format: "2017-10-10 21:25:13" "2018-08-07 15:27:45" "2018-08-17 18:06:29" ...
 $ order_estimated_delivery_date: POSIXct, format: "2017-10-18" "2018-08-13" "2018-09-04" ...
 $ order_item_id                : num  1 1 1 1 1 1 1 1 1 1 ...
 $ product_id                   : chr  "87285b34884572647811a353c7ac498a" "595fac2a385ac33a80bd5114aec74eb8" "aa4383b373c6aca5d8797843e5594415" "d0b61bfb1de832b15ba9d266ca96e5b0" ...
 $ seller_id                    : chr  "3504c0cb71d7fa48d967e0e4c94d59d9" "289cdb325fb7e7f891c38608bf9e0962" "4869f7a5dfa277a7dca6462dcf3b52b2" "66922902710d126a0e7d26b0e3805106" ...
 $ shipping_limit_date          : POSIXct, format: "2017-10-06 11:07:15" "2018-07-30 03:24:27" "2018-08-13 08:55:23" ...
 $ price                        : num  30 118.7 159.9 45 19.9 ...
 $ freight_value                : num  8.72 22.76 19.22 27.2 8.72 ...

Tidy & Manipulate Data I

tidy_data <- joined_data %>% separate (order_purchase_timestamp, into = c("purchase_Date", "purchase_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (order_approved_at, into = c("approved_Date", "approved_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (order_delivered_carrier_date, into = c("delivered_carrier_Date", "delivered_carrier_Time"), sep = " ")


tidy_data <- tidy_data %>% separate (order_delivered_customer_date, into = c("delivered_customer_Date", "delivered_customer_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (shipping_limit_date, into = c("shipping_limit_date", "shipping_limit_Time"), sep = " ")


select(tidy_data,purchase_Date,purchase_Time,approved_Date,approved_Time,delivered_carrier_Date,delivered_carrier_Time,delivered_customer_Date,delivered_customer_Time,shipping_limit_date,shipping_limit_Time) %>% head()
NA
NA

Tidy & Manipulate Data II

mutated_data <- mutate(tidy_data, Total_valueTopay =(price) + (freight_value))
select(mutated_data,order_id,price,freight_value,Total_valueTopay) %>% head()
NA
NA

Scan I to check missing values

colSums(is.na(mutated_data))
                     order_id                   customer_id                  order_status 
                            0                             0                           706 
                purchase_Date                 purchase_Time                 approved_Date 
                            0                             0                           161 
                approved_Time        delivered_carrier_Date        delivered_carrier_Time 
                          161                          1968                          1968 
      delivered_customer_Date       delivered_customer_Time order_estimated_delivery_date 
                         3229                          3229                             0 
                order_item_id                    product_id                     seller_id 
                          775                           775                           775 
          shipping_limit_date           shipping_limit_Time                         price 
                          775                           775                           775 
                freight_value              Total_valueTopay 
                          775                           775 
sum(is.na(mutated_data))
[1] 17622
scanned_data1 <- na.omit(mutated_data) # removing NA values and keeping only rows with complete cases
colSums(is.na(scanned_data1)) # Checking for removed NA values
                     order_id                   customer_id                  order_status 
                            0                             0                             0 
                purchase_Date                 purchase_Time                 approved_Date 
                            0                             0                             0 
                approved_Time        delivered_carrier_Date        delivered_carrier_Time 
                            0                             0                             0 
      delivered_customer_Date       delivered_customer_Time order_estimated_delivery_date 
                            0                             0                             0 
                order_item_id                    product_id                     seller_id 
                            0                             0                             0 
          shipping_limit_date           shipping_limit_Time                         price 
                            0                             0                             0 
                freight_value              Total_valueTopay 
                            0                             0 
sum(is.na(scanned_data1))
[1] 0

Scan II

scanned_data1$freight_value %>%  boxplot( main="Box Plot of Price", ylab="freight_value", col = "grey")

scanned_data1$Total_valueTopay %>%  boxplot( main="Box Plot of Total_valueTopay", ylab="Price", col = "grey")





scanned_data1$price %>%  boxplot( main="Box Plot of Price", ylab="Price", col = "grey")

scanned_data1$freight_value %>%  boxplot( main="Box Plot of Price", ylab="freight_value", col = "grey")

scanned_data1$Total_valueTopay %>%  boxplot( main="Box Plot of Total_valueTopay", ylab="Price", col = "grey")

NA
NA
NA

Finding the z-score for numeric variables


z.scores_price <- scanned_data1$price %>%  scores(type = "z")
z.scores_price %>% summary()
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.65342 -0.43923 -0.24726  0.00000  0.07784 36.28366 
z.scores_freight <- scanned_data1$freight_value %>%  scores(type = "z")
z.scores_freight %>% summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.2707 -0.4375 -0.2350  0.0000  0.0765 24.8244 
z.scores_total <- scanned_data1$Total_valueTopay %>%  scores(type = "z")
z.scores_total %>% summary()
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.70694 -0.44766 -0.25250  0.00000  0.09265 35.85921 

Finding the total number of outliers according to the z-score


length (which( abs(z.scores_price) >3 ))
[1] 1930
length (which( abs(z.scores_freight) >3 ))
[1] 2004
length (which( abs(z.scores_total) >3 ))
[1] 1931

Handling outliers

  • To remove outliers, we are using capping method where the values above the upper limit are replaced by 95 percentile.
  • After removing outliers, the variables are assigned to new_price, new_freight_value and Total_price


cap<- function (x){         ## Capping the dataset to handle outliers
  quantiles <- quantile(x,c(.05, 0.25, 0.75, 0.95))
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[x > quantiles[3] + 1.5*IQR(x) ] <-quantiles[4]
  x
}

new_price<- scanned_data1$price %>% cap()
new_freight_value<- scanned_data1$freight_value %>% cap()
Total_price<- scanned_data1$Total_valueTopay %>% cap()



boxplot(new_price, new_freight_value, Total_price, main = "Multiple boxplots without outliers", at = c(1,2,3),
        names = c("price", "freight", "total"),  col ="orange", border = "black")

Before Transform

*The new_price and total price histogram is plotted.

  • The below histogram plot of filtered price variable (value less than 500) shows that the data is right skewed. Thus, appropriate measures are taken to reduce skewness of the dataset
hist(new_price, xlab = "Price", main="Histogram after filtering Price <= 500")
grid()



hist(Total_price, xlab = "Total_price", main="Histogram after filtering Total_price <= 500")
grid()

NA
NA

After Transformation


squareroottransform <- sqrt(new_price)
hist(squareroottransform, xlab = "Square Root (Price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")




squareroottransform <- sqrt(Total_price)
hist(squareroottransform, xlab = "Square Root (Total_price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")

Conclusion

  • Hence, by performing various operations on the datasets like filtering, mutation, datatype conversions, using capping functions etc., we conclude that the obtained dataset can now be used to interpret data and derive inferences from it as we have successfully pre-preprocessed the data for further analysis as and when required.
---
title: "MATH2349 Semester 2, 2019"
author: "Harsham Sunder (s3773161) , Cinita Mary Varghese (s3797635) "
subtitle: Assignment 3
output:
  html_notebook: default
---


## Required packages 



```{r}
library(dplyr)     # for pipe operator
library(ggplot2)   # for display of plots
library(readr)     # to import datasets
library(lattice)   # to display graphs
library(lubridate) # for operations on date and time variables
library(tidyr)     # used to tidy data
library(forecast)  # used for Box-Cox transformation
library(validate)  # to use functions on dataframes
library(outliers)

```


## Executive Summary 


* First, the required packages are installed and loaded to use required function and operators for data pre-processing. Then the two datasets of intrigued are  imported into R workspace to perform required operations.<br>

* Before carrying out the pre-processing task, the left join condition is used to merge the two datasets in order to gather the details of only those orders which has delivery status.<br>

* The rundown of combined dataset is displayed which shows statistical and general information of all the variables. <br>

* The desired variables are changed over into reasonable data type such as factor as required<br>

* Post datatype conversions, the NA values are determined and omitted to carry out necessary operations. <br>

* The dataset is in a UNTIDY format. The date and time are in the same variable,So we separated it.<br>

* Another new variable named Total_valueTopay is mutated to display the total cost of the product including delivey charges.<br>

* The outliers of the 3 numeric variables, price, freight_value and Total_valueTopay are  checked by using boxplot. The outliers recognized are managed using the CAPPING method via a user-defined function which caps the values above 95percentile of the data.<br>

* We need to NORMALIZE the histogram displayed after removing the outliers which indicates the data is right skewed.<br>

* Finally, to reduce the skewness of the price and Total_price variables, data transformation using SQUARE ROOT METHOD is used.<br>



## Data 

* The dataset named "Brazilian E-Commerce Public Dataset by Olist", has over 100,000 Orders with reviews info,product and customer.<br>

* Source : https://www.kaggle.com/olistbr/brazilian-ecommerce
<br>

* Amongst these whole dataset We have only considered 2 files as desired by the assignment requirements which are olist_orders_dataset and olist_order_item_dataset.<br>

* These two data files have the variables as Order_id, product_id and seller_id which are alphanumeric, order_item_id, price and freight_value which are numeric, order_status is character but will be converted into factor, order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date, and shipping_limit_date are date time variables.<br>


## Importing and Reading the data set


```{r}
library(readr)
dataset_1 <- read_csv("olist_order_items_dataset.csv")
View(dataset_1)

library(readr)
dataset_2 <- read_csv("olist_orders_dataset.csv")
View(dataset_2)



```


## Order Items Dataset

```{r}

head(dataset_1)


```

## Overall Orders Dataset


```{r}

head(dataset_2)


```

## Joining Data set

* Data is joined by using left_join function and newly formed dataset is named as joined_data.<br>
```{r}

joined_data <- left_join(dataset_2, dataset_1, by = "order_id")  
head(joined_data, 10)

```

## Understand 

* We are using summary() function to understand datatypes presented in the dataset.




```{r}
summary(joined_data)

```

*  The variable order_status is in character. It is then converted into factor and level is ordered.<br>

```{r}
joined_data$order_status <- joined_data$order_status %>% factor(levels = c("unavailable", "created", "cancelled", "approved", "processing","invoiced", "shipped", "delivered"), ordered = TRUE) 

class(joined_data$order_status) # Checking class of variable after conversion

levels(joined_data$order_status) # levelling order_status variable
```

* The structure of the joined Data is found out.<br>

```{r}

str(joined_data)

```




##	Tidy & Manipulate Data I 

* The variables order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date, and shipping_limit_date are in untidy format because date and time is in same cell. So it is separated to tidy the data.<br>
* Tidied data is then shown.<br>

```{r}
tidy_data <- joined_data %>% separate (order_purchase_timestamp, into = c("purchase_Date", "purchase_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (order_approved_at, into = c("approved_Date", "approved_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (order_delivered_carrier_date, into = c("delivered_carrier_Date", "delivered_carrier_Time"), sep = " ")


tidy_data <- tidy_data %>% separate (order_delivered_customer_date, into = c("delivered_customer_Date", "delivered_customer_Time"), sep = " ")

tidy_data <- tidy_data %>% separate (shipping_limit_date, into = c("shipping_limit_date", "shipping_limit_Time"), sep = " ")


select(tidy_data,purchase_Date,purchase_Time,approved_Date,approved_Time,delivered_carrier_Date,delivered_carrier_Time,delivered_customer_Date,delivered_customer_Time,shipping_limit_date,shipping_limit_Time) %>% head()


```

##	Tidy & Manipulate Data II 

* A new variable Total_valueTopay is made by adding price and freight values.

```{r}
mutated_data <- mutate(tidy_data, Total_valueTopay =(price) + (freight_value))
select(mutated_data,order_id,price,freight_value,Total_valueTopay) %>% head()


```


##	Scan I  to check missing values
 

* The missing values of dataset has a total of 17622.<br>

* we will remove these NA values.<br>



```{r}
colSums(is.na(mutated_data))
sum(is.na(mutated_data))

```


```{r}
scanned_data1 <- na.omit(mutated_data) # removing NA values and keeping only rows with complete cases
colSums(is.na(scanned_data1)) # Checking for removed NA values
sum(is.na(scanned_data1))


```

##	Scan II

* Based on below box plot, the TUKEY's Method of Outlier detection is used to detect the outliers  of the numeric variables price,freight_value and Total_valueTopay.


```{r}

scanned_data1$price %>%  boxplot( main="Box Plot of Price", ylab="Price", col = "grey")
scanned_data1$freight_value %>%  boxplot( main="Box Plot of Price", ylab="freight_value", col = "grey")
scanned_data1$Total_valueTopay %>%  boxplot( main="Box Plot of Total_valueTopay", ylab="Price", col = "grey")



```

# Finding the z-score for numeric variables

```{r}

z.scores_price <- scanned_data1$price %>%  scores(type = "z")
z.scores_price %>% summary()

z.scores_freight <- scanned_data1$freight_value %>%  scores(type = "z")
z.scores_freight %>% summary()

z.scores_total <- scanned_data1$Total_valueTopay %>%  scores(type = "z")
z.scores_total %>% summary()

```


# Finding the total number of outliers according to the z-score 


```{r}

length (which( abs(z.scores_price) >3 ))
length (which( abs(z.scores_freight) >3 ))
length (which( abs(z.scores_total) >3 ))


```


## Handling outliers

* To remove outliers, we are using capping method where the values above the upper limit are replaced by 95 percentile.<br>
* After removing outliers, the variables are assigned to new_price, new_freight_value and Total_price

```{r}


cap<- function (x){         ## Capping the dataset to handle outliers
  quantiles <- quantile(x,c(.05, 0.25, 0.75, 0.95))
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[x > quantiles[3] + 1.5*IQR(x) ] <-quantiles[4]
  x
}

new_price<- scanned_data1$price %>% cap()
new_freight_value<- scanned_data1$freight_value %>% cap()
Total_price<- scanned_data1$Total_valueTopay %>% cap()



boxplot(new_price, new_freight_value, Total_price, main = "Multiple boxplots without outliers", at = c(1,2,3),
        names = c("price", "freight", "total"),  col ="orange", border = "black")

```


## Before	Transform 

*The new_price and total price histogram is plotted.<br>

* The below histogram plot of filtered price variable (value less than 500) shows that the data is right skewed. Thus, appropriate measures are taken to reduce skewness of the dataset<br>

```{r}
hist(new_price, xlab = "Price", main="Histogram after filtering Price <= 500")
grid()


hist(Total_price, xlab = "Total_price", main="Histogram after filtering Total_price <= 500")
grid()


```


# After Transformation

* After trying several transformations such box-cox, log10, log, min max normalisation and z-score, we found Sqrt method as the most appropriate method for the transformation.


```{r}

squareroottransform <- sqrt(new_price)
hist(squareroottransform, xlab = "Square Root (Price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")



squareroottransform <- sqrt(Total_price)
hist(squareroottransform, xlab = "Square Root (Total_price)", main = "Histogram Plot for Normalised Data using Sqrt Transformation")
```

## Conclusion

* Hence, by performing various operations on the datasets like filtering, mutation, datatype conversions, using capping functions etc., we conclude that the obtained dataset can now be used to interpret data and derive inferences from it as we have successfully pre-preprocessed the data for further analysis as and when required.
<br>
