1. Loading all the necessary libraries

library(tidyverse) # used for data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.1.2     âś” readr     2.1.4
## âś” forcats   1.0.0     âś” stringr   1.5.0
## âś” ggplot2   3.4.2     âś” tibble    3.2.1
## âś” lubridate 1.9.2     âś” tidyr     1.3.0
## âś” purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # used for visualizations
library(lubridate) # used for dates

2. Importing the Dataset from CSV file

# Using read.csv command to load data
Pakistan.Largest.Ecommerce.Dataset <- read.csv("C:/Users/nazir ali khan/Downloads/Pakistan Largest Ecommerce Dataset.csv", stringsAsFactors=TRUE)

Loading this dataset into a new variable to keep the original copy in place:

# Loading this dataset into a new variable to keep the original copy in place:
ecom <- Pakistan.Largest.Ecommerce.Dataset

3. Checking Data Types and the range of the values

glimpse(ecom)
## Rows: 584,524
## Columns: 19
## $ item_id         <int> 211131, 211133, 211134, 211135, 211136, 211137, 211138…
## $ status          <fct> complete, canceled, canceled, complete, order_refunded…
## $ created_at      <fct> 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/…
## $ sku             <fct> "kreations_YI 06-L", "kcc_Buy 2 Frey Air Freshener & G…
## $ price           <dbl> 1950.00, 240.00, 2450.00, 360.00, 555.00, 80.00, 360.0…
## $ qty_ordered     <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ grand_total     <dbl> 1950.00, 240.00, 2450.00, 60.00, 1110.00, 80.00, 60.00…
## $ category_name   <fct> Women's Fashion, Beauty & Grooming, Women's Fashion, B…
## $ discount_amount <dbl> 0, 0, 0, 300, 0, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ payment_method  <fct> cod, cod, cod, cod, cod, cod, cod, cod, ublcreditcard,…
## $ Working.Date    <fct> 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/…
## $ BI.Status       <fct> Gross, Gross, Gross, Net, Valid, Gross, Net, Net, Gros…
## $ MV              <fct> "1,950", "240", "2,450", "360", "1,110", "80", "360", …
## $ Year            <int> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, …
## $ Month           <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ Customer.Since  <fct> 2016-7, 2016-7, 2016-7, 2016-7, 2016-7, 2016-7, 2016-7…
## $ M.Y             <fct> 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul…
## $ FY              <fct> FY17, FY17, FY17, FY17, FY17, FY17, FY17, FY17, FY17, …
## $ Customer.ID     <fct> 1, 2, 3, 4, 5, 6, 7, 6, 8, 8, 9, 10, 10, 11, 12, 11, 1…

As we can see that R has treated order creation date, MV and working date as factors, we will have to change their data types.

4. Dealing with Data Types using Lubridate Package

ecom$Working.Date <- as.character(ecom$Working.Date)
ecom$created_at <- as.character(ecom$created_at)
ecom$Customer.Since <- as.character(ecom$Customer.Since)
ecom$MV <- as.numeric(ecom$MV)
ecom$Working.Date <- mdy(ecom$Working.Date)
ecom$created_at <- mdy(ecom$created_at)
ecom$Customer.Since <- ym(ecom$Customer.Since)
## Warning: 11 failed to parse.
# Checking new data types
class(ecom$MV)
## [1] "numeric"
class(ecom$Working.Date)
## [1] "Date"
class(ecom$created_at)
## [1] "Date"

5. Checking for any Missing Values

sum(is.na(ecom))
## [1] 11

Dataset has no missing values.

6. Exploratory Data Analysis

6.1 Number of Orders by Delivery Status

ecom %>% select(item_id, status) %>% 
  group_by(status) %>%
  summarise(Total_Orders = n()) %>% 
  arrange(desc(Total_Orders)) -> orders_by_status
print(orders_by_status)
## # A tibble: 16 Ă— 2
##    status         Total_Orders
##    <fct>                 <int>
##  1 complete             233685
##  2 canceled             201249
##  3 received              77290
##  4 order_refunded        59529
##  5 refund                 8050
##  6 cod                    2859
##  7 paid                   1159
##  8 closed                  494
##  9 payment_review           57
## 10 pending                  48
## 11 processing               33
## 12 holded                   31
## 13 unknown                  19
## 14 fraud                    10
## 15 pending_paypal            7
## 16 exchange                  4

6.1.1 Plot

 orders_by_status %>% ggplot(aes(Total_Orders, status)) +
  geom_bar(stat = "identity", fill = "#0D1282") +
  labs(title = "Orders by Delivery Status", x = "# of Orders", y = "Order Status")

6.2 Total Earnings by Category

ecom %>% select(category_name, grand_total) %>% 
  group_by(category_name) %>%
  summarise(Total_Amount_in_Lacs = sum(grand_total)/10^5) %>% 
  arrange(desc(Total_Amount_in_Lacs)) -> cat_total
# Rounding off values to 2 decimal places
cat_total$Total_Amount_in_Lacs <- round(cat_total$Total_Amount_in_Lacs,2)
print(cat_total)
## # A tibble: 16 Ă— 2
##    category_name      Total_Amount_in_Lacs
##    <fct>                             <dbl>
##  1 Mobiles & Tablets               24408. 
##  2 Appliances                       6568. 
##  3 Entertainment                    5390. 
##  4 Women's Fashion                  2826. 
##  5 Computing                        2025. 
##  6 Men's Fashion                    1941. 
##  7 Others                           1925. 
##  8 Superstore                       1121. 
##  9 Beauty & Grooming                 972. 
## 10 Home & Living                     882. 
## 11 Kids & Baby                       472. 
## 12 Soghaat                           463. 
## 13 Health & Sports                   418. 
## 14 unknown                           361. 
## 15 School & Education                 62.4
## 16 Books                              26.8

6.3 Average Order Value by Category

ecom %>% select(category_name, grand_total) %>% 
  group_by(category_name) %>%
  summarise(Avg_Order_Value = mean(grand_total)) -> cat_avg
cat_avg %>% arrange(desc(Avg_Order_Value)) -> cat_avg
cat_avg$Avg_Order_Value <- round(cat_avg$Avg_Order_Value,2)
print(cat_avg)
## # A tibble: 16 Ă— 2
##    category_name      Avg_Order_Value
##    <fct>                        <dbl>
##  1 Mobiles & Tablets           21094.
##  2 Entertainment               20476.
##  3 Computing                   12712.
##  4 Appliances                  12532.
##  5 Others                       6587.
##  6 Women's Fashion              4732.
##  7 unknown                      4506.
##  8 Home & Living                3328.
##  9 Kids & Baby                  2864.
## 10 Superstore                   2571.
## 11 Health & Sports              2388 
## 12 Beauty & Grooming            2342.
## 13 Men's Fashion                2105.
## 14 School & Education           1793.
## 15 Books                        1431.
## 16 Soghaat                      1363.

6.4 Number of Orders by Payment Method

ecom %>% select(payment_method, item_id) %>% 
  group_by(payment_method) %>%
  summarise(Total_Payments = n()) %>% 
  arrange(desc(Total_Payments)) -> Payments_Frequency
print(Payments_Frequency)
## # A tibble: 18 Ă— 2
##    payment_method    Total_Payments
##    <fct>                      <int>
##  1 cod                       271960
##  2 Payaxis                    97641
##  3 Easypay                    82900
##  4 jazzwallet                 35145
##  5 easypay_voucher            31176
##  6 bankalfalah                23065
##  7 jazzvoucher                15633
##  8 Easypay_MA                 14028
##  9 customercredit              7555
## 10 apg                         1758
## 11 ublcreditcard                882
## 12 cashatdoorstep               732
## 13 mcblite                      723
## 14 mygateway                    669
## 15 internetbanking              472
## 16 productcredit                125
## 17 marketingexpense              45
## 18 financesettlement             15

6.5 Average Discount by Top 10 Payment Methods

ecom %>% select(payment_method, discount_amount, item_id) %>% 
  group_by(payment_method) %>% 
  summarise(Avg_Discount = sum(discount_amount)/n()) %>% 
  arrange(desc(Avg_Discount)) %>% 
  head(10) -> Avg_Discount
Avg_Discount$Avg_Discount <- round(Avg_Discount$Avg_Discount,2)
print(Avg_Discount)
## # A tibble: 10 Ă— 2
##    payment_method  Avg_Discount
##    <fct>                  <dbl>
##  1 easypay_voucher       2196. 
##  2 jazzvoucher           1383. 
##  3 Payaxis                752. 
##  4 apg                    729. 
##  5 Easypay_MA             702. 
##  6 Easypay                680. 
##  7 jazzwallet             473. 
##  8 bankalfalah            393. 
##  9 cod                    128. 
## 10 customercredit          67.4

6.5.1 Plot

Avg_Discount %>% ggplot(aes(payment_method, Avg_Discount)) +
  geom_bar(stat = "identity", fill = "#0D1282" ) +
  coord_flip() + 
  labs(title = "Avg Discount by Payment Method", x = "Discount Amount", y = "Payment Method") 

6.6 Number of Customers by Month

ecom %>% select(Customer.Since, Year) %>% 
  group_by(Customer.Since) %>% 
  summarise(Total_Customers = n()) -> customers_by_month
print(customers_by_month)
## # A tibble: 27 Ă— 2
##    Customer.Since Total_Customers
##    <date>                   <int>
##  1 2016-07-01               57069
##  2 2016-08-01               21051
##  3 2016-09-01               46746
##  4 2016-10-01               17546
##  5 2016-11-01               82714
##  6 2016-12-01               10638
##  7 2017-01-01               12260
##  8 2017-02-01               10293
##  9 2017-03-01               18234
## 10 2017-04-01               16826
## # ℹ 17 more rows

6.6.1 Plot

customers_by_month %>% ggplot(aes(Customer.Since, Total_Customers)) +
  geom_point(color="#0D1282") +
  geom_line(color="#0D1282") +
  geom_smooth(method = lm, color = "#0D1282") +
  labs(title = "Customers by Year", x = "Year", y = "# of Customers") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 row containing missing values (`geom_line()`).

6.7 Number of Orders by Year

ecom %>% select(Year, item_id) %>% 
  group_by(Year) %>% 
  summarise(Total_Order = n()) -> Orders_Per_Year

6.7.1 Plot

bar_chart <- Orders_Per_Year %>%ggplot(aes(Year, Total_Order)) +
  geom_bar(stat = "identity", fill = "#0D1282") +
  labs(title = "# of Order by Year", x = "Years" , y = "# of Orders")
print(bar_chart)

6.8 Sales by Fiscal Year

ecom %>% select(FY, grand_total) %>% 
  group_by(FY) %>% 
  summarise(Total_Sales_in_Arabs = sum(grand_total/10^9)) -> sales_year

6.8.1 Plot

sales_year %>% ggplot(aes(FY, Total_Sales_in_Arabs)) +
  geom_bar(fill="#0D1282", stat = "identity") + 
  labs(title = "Total Sales by Fiscal Year", x= "Fiscal Year", y = "Total Sales in Arab")

Thank You…..