Required packages

The following packages were used during the course of this assignment.


library(readr)
library(lubridate)
library(dplyr)
library(tidyr)
library(magrittr)
library(MVN)

Executive Summary

Data preprocessing is a cruical step in data anaylsis. It prepares the data for further statistical analysis to obtain accurate results by ensuring that the data is clean and tidy. For this report, the following data preprocessing steps were taken:

Data

The two datasets used in the report are the Brazilian E-Commerce Datasets by Olist taken from:
Olist order Items Dataset: https://www.kaggle.com/olistbr/brazilian-ecommerce?select=olist_order_items_dataset.csv
Olist Sellers Dataset: https://www.kaggle.com/olistbr/brazilian-ecommerce?select=olist_sellers_dataset.csv

The accompanying datasets have data of orders made at Olist, the biggest retail chain in Brazilian commercial centers. Olist interfaces little organizations from all over Brazil to channels without trouble and with a solitary agreement. When a client buys from Olist, the dealer gets educated to satisfy that order.It comprises of 100k requests from 2016 to 2018 made at various commercial centers in Brazil.

The first dataset used is “Olist Order Items Dataset”. It provides data about items purchased within each order. This dataset consist of 112650 observations and the following 7 variables:

The second dataset used is “Olist Sellers Dataset”. This dataset incorporates data about the sellers that finished requests made at Olist. There are 3095 observations and 4 variables:

The two datasets were merged into one, by the common attribute ‘seller_id’ to show information about items in each order and its seller. The resultant dataset had 2 numeric variables, namely, ‘price’ and ‘freight_value’.


#Reading the Order Items Dataset
order_items<- read_csv("olist_order_items_dataset.csv")
Parsed with column specification:
cols(
  order_id = col_character(),
  order_item_id = col_double(),
  product_id = col_character(),
  seller_id = col_character(),
  shipping_limit_date = col_character(),
  price = col_double(),
  freight_value = col_double()
)
head(order_items)

#Reading the Sellers Dataset
seller<-read_csv("olist_sellers_dataset.csv")
Parsed with column specification:
cols(
  seller_id = col_character(),
  seller_zip_code_prefix = col_double(),
  seller_city = col_character(),
  seller_state = col_character()
)
head(seller)

#Merging the datasets Olist Order Items and Olist Sellers 
join<- order_items %>% left_join(seller, by= "seller_id")
head(join)
NA

Understand


#Checking dimension of 'join'
dim(join)
[1] 112650     10
#Subsetting with first 5000 observations
seller_order_list<- join[1:5000,]
dim(seller_order_list)
[1] 5000   10
str(seller_order_list)
tibble [5,000 x 10] (S3: tbl_df/tbl/data.frame)
 $ order_id              : chr [1:5000] "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
 $ order_item_id         : num [1:5000] 1 1 1 1 1 1 1 1 1 1 ...
 $ product_id            : chr [1:5000] "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
 $ seller_id             : chr [1:5000] "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
 $ shipping_limit_date   : chr [1:5000] "19-09-2017 09:45" "03-05-2017 11:05" "18-01-2018 14:48" "15-08-2018 10:10" ...
 $ price                 : num [1:5000] 58.9 239.9 199 13 199.9 ...
 $ freight_value         : num [1:5000] 13.3 19.9 17.9 12.8 18.1 ...
 $ seller_zip_code_prefix: num [1:5000] 27277 3471 37564 14403 87900 ...
 $ seller_city           : chr [1:5000] "volta redonda" "sao paulo" "borda da mata" "franca" ...
 $ seller_state          : chr [1:5000] "SP" "SP" "MG" "SP" ...
#Checking the data types of all 10 variables
class(seller_order_list$order_id)
[1] "character"
class(seller_order_list$order_item_id)
[1] "numeric"
class(seller_order_list$product_id)
[1] "character"
class(seller_order_list$seller_id)
[1] "character"
class(seller_order_list$shipping_limit_date)
[1] "character"
class(seller_order_list$price)
[1] "numeric"
class(seller_order_list$freight_value)
[1] "numeric"
class(seller_order_list$seller_zip_code_prefix)
[1] "numeric"
class(seller_order_list$seller_city)
[1] "character"
class(seller_order_list$seller_state)
[1] "character"
#Type conversions
#Converting 'seller_state' to a fctor variable
seller_order_list$seller_state<- factor(seller_order_list$seller_state, 
                                        levels = c("AC","AM","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PI","PR","RJ","RN","RO","RS","SC","SE","SP"))
levels(seller_order_list$seller_state)
 [1] "AC" "AM" "BA" "CE" "DF" "ES" "GO" "MA" "MG" "MS" "MT" "PA" "PB" "PE" "PI"
[16] "PR" "RJ" "RN" "RO" "RS" "SC" "SE" "SP"
class(seller_order_list$seller_state)
[1] "factor"
#Converting 'seller_zip_code_prefix' to a character variable
seller_order_list$seller_zip_code_prefix<- as.character(seller_order_list$seller_zip_code_prefix)
class(seller_order_list$seller_zip_code_prefix)
[1] "character"

Tidy & Manipulate Data I


#Separating the 'shipping_limit_date' into two columns
seller_order_list<-separate(seller_order_list,shipping_limit_date, into = c("Shipping Date", "Shipping Time"), sep = " ")
seller_order_list %>% head(10)

#Converting 'Shipping Date' to date format
seller_order_list$`Shipping Date`<- format(as.Date(seller_order_list$`Shipping Date`, format = "%d-%m-%Y"), "%d-%m-%Y")
seller_order_list$`Shipping Date` %>% head(10)
 [1] "19-09-2017" "03-05-2017" "18-01-2018" "15-08-2018" "13-02-2017"
 [6] "23-05-2017" "14-12-2017" "10-07-2018" "26-03-2018" "06-07-2018"

Tidy & Manipulate Data II


#Mutating total price column
seller_order_list<- seller_order_list %>% group_by(order_id) %>%
  mutate(total_order_value= (price * length(order_id)+(freight_value * length(order_id))))

seller_order_list$total_order_value %>% head(10)
 [1]  72.19 259.83 216.87  25.78 218.04  34.59  31.75 880.75 157.60  65.39

Scan I


#Scanning for missing values
colSums(is.na(seller_order_list))
              order_id          order_item_id             product_id 
                     0                      0                      0 
             seller_id          Shipping Date          Shipping Time 
                     0                      0                      0 
                 price          freight_value seller_zip_code_prefix 
                     0                      0                      0 
           seller_city           seller_state      total_order_value 
                     0                      0                      0 
#Scanning for special values
sub<- seller_order_list %>% select(price, freight_value, total_order_value)
Adding missing grouping variables: `order_id`
seller_order_sub<- sub[,2:4]

special_values <- function(x){
  if (is.numeric(x)) (is.infinite(x) | is.nan(x))
}

sp_values<-sapply(seller_order_sub, FUN = special_values)
sapply(seller_order_sub, function(x){if (is.numeric(x)) sum(special_values(x))})
            price     freight_value total_order_value 
                0                 0                 0 
##Scanning for inconsistencies or obvious errors 
nonnegative<- function(x){x<0}
nonnegative_price<-sapply(seller_order_sub, FUN = nonnegative)
colSums(nonnegative_price)
            price     freight_value total_order_value 
                0                 0                 0 

Scan II


# Mahanabolis Outlier detection method
seller_order_outliers <- mvn(data = seller_order_sub, multivariateOutlierMethod = "quan", showOutliers = TRUE)
The covariance matrix has become singular during
the iterations of the MCD algorithm.
There are 3893 observations (in the entire dataset of 5000 obs.) lying
on the plane with equation 0.57735 (x_i1-m_1) + 0.57735 (x_i2-m_2) +
-0.57735 (x_i3-m_3) = 0 with (m_1,m_2) the mean of these observations.

#Checking locations of the outliers
seller_order_outliers$multivariateOutliers

#Capping
cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

#Descriptive statistics of the subset
summary(seller_order_sub)
     price         freight_value    total_order_value 
 Min.   :   3.49   Min.   :  0.00   Min.   :   13.38  
 1st Qu.:  39.99   1st Qu.: 13.11   1st Qu.:   66.25  
 Median :  77.90   Median : 16.41   Median :  114.92  
 Mean   : 124.36   Mean   : 20.28   Mean   :  198.56  
 3rd Qu.: 138.00   3rd Qu.: 21.64   3rd Qu.:  191.67  
 Max.   :6735.00   Max.   :375.28   Max.   :13664.08  
#Applying cap() to the numeric variables
price_capped<- seller_order_list$price %>% cap()
freight_value_capped<- seller_order_list$freight_value %>% cap()
total_price_capped<- seller_order_list$total_order_value %>% cap()

#Checking the summary statistics again
seller_order_capped<- sapply(seller_order_sub, FUN = cap)
summary(seller_order_capped)
     price        freight_value   total_order_value
 Min.   :  3.49   Min.   : 1.09   Min.   : 13.38   
 1st Qu.: 39.99   1st Qu.:13.11   1st Qu.: 66.25   
 Median : 77.90   Median :16.41   Median :114.92   
 Mean   :104.66   Mean   :19.29   Mean   :157.57   
 3rd Qu.:138.00   3rd Qu.:21.64   3rd Qu.:191.67   
 Max.   :350.44   Max.   :46.41   Max.   :514.40   

Transform


# Checking the distribution of total_price_capped
hist(total_price_capped, main = "Histogram for total_order_value", xlab = "total_order_value")


#Applying natural log transformation
ln_total<- log(total_price_capped)
hist(ln_total,main = "Log Transformation of Total Price", xlab = "Total Price")


# Checking the distribution of price_capped
hist(price_capped, main = "Histogram for price", xlab = "Price of the order item")


#Applying natural log transformation
ln_price<- log(price_capped)
hist(ln_price,main = "Log Transformation of Price", xlab = "Price of the order item")


# Checking the distribution of freight_value_capped
hist(freight_value_capped, main = "Histogram for freight_value", xlab = "Freight value")


#Applying square root transformation
sqrt_total<- sqrt(freight_value_capped)
hist(sqrt_total, main = "Square Root Transformation of  freight_value", xlab = "Freight value")



---
title: "MATH2349 Semester 1, 2020"
author: "Anushka Sharma 3817004"
subtitle: Assignment 2
output:
  html_notebook: default
  html_document:
    df_print: paged
---

## Required packages 


The following packages were used during the course of this assignment.

```{r}

library(readr)
library(lubridate)
library(dplyr)
library(tidyr)
library(magrittr)
library(MVN)

```


## Executive Summary 

Data preprocessing is a cruical step in data anaylsis. It prepares the data for further statistical analysis to obtain accurate results by ensuring that the data is clean and tidy. For this report, the following data preprocessing steps were taken:

* The datasets were imported into R studio using readr library function and were merged based on the common attribute.
* Once imported,a subset of the original dataset was created for further operations that includes the frist 5000 observations. The dimensionality and structure of the dataset was checked along with understanding the semantics of the variables. Necessary type conversions were made in order to preserve the meaning of the variables.
* The datasets were examined to see if they were untidy and one of the dataset was messy as one of its columns comprised of multiple variables. The respective dataset was then tidied using separate() function.
* Once tidy, a new variable was added to the dataset using mutate() function that was created from the already existing variables.
* The next step was to scan the dataset for missing values along with special and inconsistent values. No such values were found in the dataset.
* After looking for missing values, presence of outliers in numeric variables were detected using MVN package and were dealt by capping or winsorising.
* Finally, distributions of the numeric variables were checked using histogram plots and appropriate transformations were applied to reduce the skewness.

## Data 

The two datasets used in the report are the Brazilian E-Commerce Datasets by Olist taken from:
<br>
Olist order Items Dataset: https://www.kaggle.com/olistbr/brazilian-ecommerce?select=olist_order_items_dataset.csv
<br>
Olist Sellers Dataset: 
https://www.kaggle.com/olistbr/brazilian-ecommerce?select=olist_sellers_dataset.csv
<br>

The accompanying datasets have data of orders made at Olist, the biggest retail chain in Brazilian commercial centers. Olist interfaces little organizations from all over Brazil to channels without trouble and with a solitary agreement. When a client buys from Olist, the dealer gets educated to satisfy that order.It comprises of 100k requests from 2016 to 2018 made at various commercial centers in Brazil. 
<br>

The first dataset used is "Olist Order Items Dataset". It provides data about items purchased within each order. This dataset consist of 112650 observations and the following 7 variables:

* Order_id : ID of order which is unique for each order

* Order_item_id : The number of items in a particular order, represented sequentially

* Product_id : ID of product which is unique for each product

* Seller_id : ID of seller which is unique for each seller

* Shipping_limit_date : Represents the shipping date and time by the seller to handle over the products to the logistic partners

* Price : Price of the item

* Freight_value : Freight value of item


The second dataset used is "Olist Sellers Dataset". This dataset incorporates data about the sellers that finished requests made at Olist. There are 3095 observations and 4 variables:

* Seller_id : Unique seller identifier

* Seller_zip_code_prefix : The first five digits of seller's zip code

* Seller_city : Seller's city name

* Seller_state : State of the Seller

The two datasets were merged into one, by the common attribute 'seller_id' to show information about items in each order and its seller. The resultant dataset had 2 numeric variables, namely, 'price' and 'freight_value'.

* The datasets were read in R studio using read_csv() function from 'readr' package.
* head() function was used to display initial observations of the datasets along with the header.
* left_join() function was used to join the two datasets by the variable 'seller_id' to display all the order items information along with their respective seller's details. It was then displayed using head() function.

```{r}

#Reading the Order Items Dataset
order_items<- read_csv("olist_order_items_dataset.csv")
head(order_items)

#Reading the Sellers Dataset
seller<-read_csv("olist_sellers_dataset.csv")
head(seller)

#Merging the datasets Olist Order Items and Olist Sellers 
join<- order_items %>% left_join(seller, by= "seller_id")
head(join)

```

## Understand 

* Once the datasets were merged, the dimensions was checked using dim() functions.
* The new dataset 'join' had 112650 observations and total of 10 variables. Since the number of observations were large and some packages like MVN, work with 5000 observations or less, a subset of the data was generated as 'seller_order_list' which consists of the first 5000 entries from the original merged dataset. 
* Structure of the subset was checked using str() function.
* Data types of all the 10 variables were checked with the help of class() function. The dataset consists of several character, numeric variables.
* The 'seller_state' variable was converted to a factor using factor() function and its levels were set.
* The 'seller_zip_code_prefix' variable was converted from numeric to character because ZIP code is not treated as a number that requires any computations.


```{r}

#Checking dimension of 'join'
dim(join)

#Subsetting with first 5000 observations
seller_order_list<- join[1:5000,]
dim(seller_order_list)
str(seller_order_list)

#Checking the data types of all 10 variables
class(seller_order_list$order_id)
class(seller_order_list$order_item_id)
class(seller_order_list$product_id)
class(seller_order_list$seller_id)
class(seller_order_list$shipping_limit_date)
class(seller_order_list$price)
class(seller_order_list$freight_value)
class(seller_order_list$seller_zip_code_prefix)
class(seller_order_list$seller_city)
class(seller_order_list$seller_state)

#Type conversions
#Converting 'seller_state' to a fctor variable
seller_order_list$seller_state<- factor(seller_order_list$seller_state, 
                                        levels = c("AC","AM","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PI","PR","RJ","RN","RO","RS","SC","SE","SP"))
levels(seller_order_list$seller_state)
class(seller_order_list$seller_state)

#Converting 'seller_zip_code_prefix' to a character variable
seller_order_list$seller_zip_code_prefix<- as.character(seller_order_list$seller_zip_code_prefix)
class(seller_order_list$seller_zip_code_prefix)

```


##	Tidy & Manipulate Data I 

* The olist_order_items_dataset was messy because the variable 'shipping_limit_date' stores the date and time of shipping. In this case, mutliple variables are stored in one column and makes the dataset untidy.
* The 'shipping_limit_date' variable is then split into two columns, 'Shipping Date' and 'Shipping Time' using separate() function from tidyr package, by space as a separator. 
* Once separate, the 'Shipping Date' variable was then correctly converted as a date using Base R as.Date() function and was formatted as the original DD-MM-YYY format using format() function.


```{r}

#Separating the 'shipping_limit_date' into two columns
seller_order_list<-separate(seller_order_list,shipping_limit_date, into = c("Shipping Date", "Shipping Time"), sep = " ")
seller_order_list %>% head(10)

#Converting 'Shipping Date' to date format
seller_order_list$`Shipping Date`<- format(as.Date(seller_order_list$`Shipping Date`, format = "%d-%m-%Y"), "%d-%m-%Y")
seller_order_list$`Shipping Date` %>% head(10)

```

##	Tidy & Manipulate Data II 

* The olist_order_items_dataset consists details about each order with an order ID. It was observed that some orders had multiple items, where the price and freight value was split between the items. 
* 'order_item_id' is a sequential number identifying number of items included in the same order.
* A new variable 'total_order_value' was created that computes the total price of the order, i.e. Total order value = price + freight value.
* mutate() function was used to add the 'total_order_value' column to the data after grouping by the order_id and adding the price and freight value.
* The price and freight value for each order was calculated by multiplying the price and freight value with the number of items in each order using length(order_id).
* For example: The order with order_id = 00143d0f86d6fbd9f9b38ab440ac16f5 has 3 items. To get the total order value for each order just sum needs to be calculated as: 
    - The total order price value is: 21.33 * 3 = 63.99 
    - The total freight value is: 15.10 * 3 = 45.30
    - Therefore, the total order value (price + freight) is: 45.30 + 63.99 = 109.29


```{r}

#Mutating total price column
seller_order_list<- seller_order_list %>% group_by(order_id) %>%
  mutate(total_order_value= (price * length(order_id)+(freight_value * length(order_id))))

seller_order_list$total_order_value %>% head(10)

```


##	Scan I 

* The data was scanned for missing values using is.na() function and the total number of missing values in each column was checked using colSums() function. It was observed that there are no missing values in the used dataset since the total missing values for each column comes out to be 0.
* Every numerical column in the dataset, was then scanned for special values such as -Inf, Inf and NaN. 
    - Firstly, a subset of the numeric columns i.e. price, freight_value and total_order_value was created using select() function and then selecting just the numeric columns. This variable was named 'seller_order_sub'.
    - To check for special values, a function called special_values() was created.
    - Using sapply(), the special_values() function was applied to 'seller_order_sub'. The result was quite non-informative since it was long, therefore sum() was added together with special_values function to calculate the total special values for each column.
    - There were no special values observed in the numerical columns.
* The dataset represents price of various orders along with the freight value. In real world situation, these values cannot be negative. Therefore, each numeric price variables were checked for obvious inconsistency or error. 
    - A new function called nonnegative() was created that checks if the value are less than zero. It was then applied to the numeric values subset using sapply(). 
    - To check the total number of inconsistent values in each column, colSums() was used and it resulted in 0 inconsistent values for each column.


```{r}

#Scanning for missing values
colSums(is.na(seller_order_list))

#Scanning for special values
sub<- seller_order_list %>% select(price, freight_value, total_order_value)
seller_order_sub<- sub[,2:4]

special_values <- function(x){
  if (is.numeric(x)) (is.infinite(x) | is.nan(x))
}

sp_values<-sapply(seller_order_sub, FUN = special_values)
sapply(seller_order_sub, function(x){if (is.numeric(x)) sum(special_values(x))})

##Scanning for inconsistencies or obvious errors 
nonnegative<- function(x){x<0}
nonnegative_price<-sapply(seller_order_sub, FUN = nonnegative)
colSums(nonnegative_price)

```


##	Scan II

* The dataset was now scanned for possible outliers. The data contained three numeric variables and with the help of the Mahalanobis distance method the outliers were detected for multivariate setting. 
* Using the mvn() function on 'seller_order_sub' from the MVN package, multivariate outliers were detected with QQ plot. The showOutliers = TRUE argument depicts the multivariate outliers and show them on QQ plot.
* The QQ plot suggests existence of 68 outliers for this subset. 
* The location of outliers in the dataset were detected using seller_order_outliers$multivariateOutliers.
* Once the presence of outliers in the subset of dataset was detected, the next step was to handle them.
* Capping or winsorising was performed in order to deal with the outliers, instead of removing them. This is because the outliers in this case may not necessarily mean that the values reported are due to some errors. Sometimes outliers can be legitimate observations. A high total order value can either mean the number of items in the order are large or the item price along with freight value are high. 
* cap() function was created to replace those data points lying outside the lower limit and above the upper limit with the value of 5th percentile and 95th percentile.
* Before capping, descriptive statistics of the subset was produced using summary().
* Each numeric variable was then capped using cap() function and the results were stored in the variables 'price_capped', 'freight_value_capped' and 'total_price_capped'.
* Summary statistics of the capped subset was computed using summary().


```{r}

# Mahanabolis Outlier detection method
seller_order_outliers <- mvn(data = seller_order_sub, multivariateOutlierMethod = "quan", showOutliers = TRUE)

#Checking locations of the outliers
seller_order_outliers$multivariateOutliers

#Capping
cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

#Descriptive statistics of the subset
summary(seller_order_sub)

#Applying cap() to the numeric variables
price_capped<- seller_order_list$price %>% cap()
freight_value_capped<- seller_order_list$freight_value %>% cap()
total_price_capped<- seller_order_list$total_order_value %>% cap()

#Checking the summary statistics again
seller_order_capped<- sapply(seller_order_sub, FUN = cap)
summary(seller_order_capped)

```


##	Transform 

* The 'total_price_capped', 'price_capped' represents the total_order_value and price after capping the outliers. On producing a histogram for both the variables, it was observed that they had right-skewed distributions.
* Several methods, for instance, log10 transformation, square root transformation, reciprocal transformation and BoxCox transformation was applied to reduce the skewness but the best results were obtained using natural logarithm transformation.
* Since the variables had no zero or negative values, log() function was used to tranform the 'total_price_capped' and 'price_capped' variable to produce a more symmetrical distribution.
* The 'freight_value_capped' variable is observed to be mildly right skewed after checking its histogram.
* To reduce the skeweness, several transformation techniques were applied and the best results were obtained by using square root transformation, using sqrt() function.
* Histograms of all three variables were plot to check the transformation results using hist() function.


```{r}

# Checking the distribution of total_price_capped
hist(total_price_capped, main = "Histogram for total_order_value", xlab = "total_order_value")

#Applying natural log transformation
ln_total<- log(total_price_capped)
hist(ln_total,main = "Log Transformation of Total Price", xlab = "Total Price")

# Checking the distribution of price_capped
hist(price_capped, main = "Histogram for price", xlab = "Price of the order item")

#Applying natural log transformation
ln_price<- log(price_capped)
hist(ln_price,main = "Log Transformation of Price", xlab = "Price of the order item")

# Checking the distribution of freight_value_capped
hist(freight_value_capped, main = "Histogram for freight_value", xlab = "Freight value")

#Applying square root transformation
sqrt_total<- sqrt(freight_value_capped)
hist(sqrt_total, main = "Square Root Transformation of  freight_value", xlab = "Freight value")

```

<br>
<br>
