library(tidyverse) # used for data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.2 âś” readr 2.1.4
## âś” forcats 1.0.0 âś” stringr 1.5.0
## âś” ggplot2 3.4.2 âś” tibble 3.2.1
## âś” lubridate 1.9.2 âś” tidyr 1.3.0
## âś” purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # used for visualizations
library(lubridate) # used for dates
# Using read.csv command to load data
Pakistan.Largest.Ecommerce.Dataset <- read.csv("C:/Users/nazir ali khan/Downloads/Pakistan Largest Ecommerce Dataset.csv", stringsAsFactors=TRUE)
Loading this dataset into a new variable to keep the original copy in place:
# Loading this dataset into a new variable to keep the original copy in place:
ecom <- Pakistan.Largest.Ecommerce.Dataset
glimpse(ecom)
## Rows: 584,524
## Columns: 19
## $ item_id <int> 211131, 211133, 211134, 211135, 211136, 211137, 211138…
## $ status <fct> complete, canceled, canceled, complete, order_refunded…
## $ created_at <fct> 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/…
## $ sku <fct> "kreations_YI 06-L", "kcc_Buy 2 Frey Air Freshener & G…
## $ price <dbl> 1950.00, 240.00, 2450.00, 360.00, 555.00, 80.00, 360.0…
## $ qty_ordered <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ grand_total <dbl> 1950.00, 240.00, 2450.00, 60.00, 1110.00, 80.00, 60.00…
## $ category_name <fct> Women's Fashion, Beauty & Grooming, Women's Fashion, B…
## $ discount_amount <dbl> 0, 0, 0, 300, 0, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ payment_method <fct> cod, cod, cod, cod, cod, cod, cod, cod, ublcreditcard,…
## $ Working.Date <fct> 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/2016, 7/1/…
## $ BI.Status <fct> Gross, Gross, Gross, Net, Valid, Gross, Net, Net, Gros…
## $ MV <fct> "1,950", "240", "2,450", "360", "1,110", "80", "360", …
## $ Year <int> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, …
## $ Month <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ Customer.Since <fct> 2016-7, 2016-7, 2016-7, 2016-7, 2016-7, 2016-7, 2016-7…
## $ M.Y <fct> 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul, 16-Jul…
## $ FY <fct> FY17, FY17, FY17, FY17, FY17, FY17, FY17, FY17, FY17, …
## $ Customer.ID <fct> 1, 2, 3, 4, 5, 6, 7, 6, 8, 8, 9, 10, 10, 11, 12, 11, 1…
As we can see that R has treated order creation date, MV and working date as factors, we will have to change their data types.
ecom$Working.Date <- as.character(ecom$Working.Date)
ecom$created_at <- as.character(ecom$created_at)
ecom$Customer.Since <- as.character(ecom$Customer.Since)
ecom$MV <- as.numeric(ecom$MV)
ecom$Working.Date <- mdy(ecom$Working.Date)
ecom$created_at <- mdy(ecom$created_at)
ecom$Customer.Since <- ym(ecom$Customer.Since)
## Warning: 11 failed to parse.
# Checking new data types
class(ecom$MV)
## [1] "numeric"
class(ecom$Working.Date)
## [1] "Date"
class(ecom$created_at)
## [1] "Date"
sum(is.na(ecom))
## [1] 11
Dataset has no missing values.
ecom %>% select(item_id, status) %>%
group_by(status) %>%
summarise(Total_Orders = n()) %>%
arrange(desc(Total_Orders)) -> orders_by_status
print(orders_by_status)
## # A tibble: 16 Ă— 2
## status Total_Orders
## <fct> <int>
## 1 complete 233685
## 2 canceled 201249
## 3 received 77290
## 4 order_refunded 59529
## 5 refund 8050
## 6 cod 2859
## 7 paid 1159
## 8 closed 494
## 9 payment_review 57
## 10 pending 48
## 11 processing 33
## 12 holded 31
## 13 unknown 19
## 14 fraud 10
## 15 pending_paypal 7
## 16 exchange 4
orders_by_status %>% ggplot(aes(Total_Orders, status)) +
geom_bar(stat = "identity", fill = "#0D1282") +
labs(title = "Orders by Delivery Status", x = "# of Orders", y = "Order Status")
ecom %>% select(category_name, grand_total) %>%
group_by(category_name) %>%
summarise(Total_Amount_in_Lacs = sum(grand_total)/10^5) %>%
arrange(desc(Total_Amount_in_Lacs)) -> cat_total
# Rounding off values to 2 decimal places
cat_total$Total_Amount_in_Lacs <- round(cat_total$Total_Amount_in_Lacs,2)
print(cat_total)
## # A tibble: 16 Ă— 2
## category_name Total_Amount_in_Lacs
## <fct> <dbl>
## 1 Mobiles & Tablets 24408.
## 2 Appliances 6568.
## 3 Entertainment 5390.
## 4 Women's Fashion 2826.
## 5 Computing 2025.
## 6 Men's Fashion 1941.
## 7 Others 1925.
## 8 Superstore 1121.
## 9 Beauty & Grooming 972.
## 10 Home & Living 882.
## 11 Kids & Baby 472.
## 12 Soghaat 463.
## 13 Health & Sports 418.
## 14 unknown 361.
## 15 School & Education 62.4
## 16 Books 26.8
ecom %>% select(category_name, grand_total) %>%
group_by(category_name) %>%
summarise(Avg_Order_Value = mean(grand_total)) -> cat_avg
cat_avg %>% arrange(desc(Avg_Order_Value)) -> cat_avg
cat_avg$Avg_Order_Value <- round(cat_avg$Avg_Order_Value,2)
print(cat_avg)
## # A tibble: 16 Ă— 2
## category_name Avg_Order_Value
## <fct> <dbl>
## 1 Mobiles & Tablets 21094.
## 2 Entertainment 20476.
## 3 Computing 12712.
## 4 Appliances 12532.
## 5 Others 6587.
## 6 Women's Fashion 4732.
## 7 unknown 4506.
## 8 Home & Living 3328.
## 9 Kids & Baby 2864.
## 10 Superstore 2571.
## 11 Health & Sports 2388
## 12 Beauty & Grooming 2342.
## 13 Men's Fashion 2105.
## 14 School & Education 1793.
## 15 Books 1431.
## 16 Soghaat 1363.
ecom %>% select(payment_method, item_id) %>%
group_by(payment_method) %>%
summarise(Total_Payments = n()) %>%
arrange(desc(Total_Payments)) -> Payments_Frequency
print(Payments_Frequency)
## # A tibble: 18 Ă— 2
## payment_method Total_Payments
## <fct> <int>
## 1 cod 271960
## 2 Payaxis 97641
## 3 Easypay 82900
## 4 jazzwallet 35145
## 5 easypay_voucher 31176
## 6 bankalfalah 23065
## 7 jazzvoucher 15633
## 8 Easypay_MA 14028
## 9 customercredit 7555
## 10 apg 1758
## 11 ublcreditcard 882
## 12 cashatdoorstep 732
## 13 mcblite 723
## 14 mygateway 669
## 15 internetbanking 472
## 16 productcredit 125
## 17 marketingexpense 45
## 18 financesettlement 15
ecom %>% select(payment_method, discount_amount, item_id) %>%
group_by(payment_method) %>%
summarise(Avg_Discount = sum(discount_amount)/n()) %>%
arrange(desc(Avg_Discount)) %>%
head(10) -> Avg_Discount
Avg_Discount$Avg_Discount <- round(Avg_Discount$Avg_Discount,2)
print(Avg_Discount)
## # A tibble: 10 Ă— 2
## payment_method Avg_Discount
## <fct> <dbl>
## 1 easypay_voucher 2196.
## 2 jazzvoucher 1383.
## 3 Payaxis 752.
## 4 apg 729.
## 5 Easypay_MA 702.
## 6 Easypay 680.
## 7 jazzwallet 473.
## 8 bankalfalah 393.
## 9 cod 128.
## 10 customercredit 67.4
Avg_Discount %>% ggplot(aes(payment_method, Avg_Discount)) +
geom_bar(stat = "identity", fill = "#0D1282" ) +
coord_flip() +
labs(title = "Avg Discount by Payment Method", x = "Discount Amount", y = "Payment Method")
ecom %>% select(Customer.Since, Year) %>%
group_by(Customer.Since) %>%
summarise(Total_Customers = n()) -> customers_by_month
print(customers_by_month)
## # A tibble: 27 Ă— 2
## Customer.Since Total_Customers
## <date> <int>
## 1 2016-07-01 57069
## 2 2016-08-01 21051
## 3 2016-09-01 46746
## 4 2016-10-01 17546
## 5 2016-11-01 82714
## 6 2016-12-01 10638
## 7 2017-01-01 12260
## 8 2017-02-01 10293
## 9 2017-03-01 18234
## 10 2017-04-01 16826
## # ℹ 17 more rows
customers_by_month %>% ggplot(aes(Customer.Since, Total_Customers)) +
geom_point(color="#0D1282") +
geom_line(color="#0D1282") +
geom_smooth(method = lm, color = "#0D1282") +
labs(title = "Customers by Year", x = "Year", y = "# of Customers")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 row containing missing values (`geom_line()`).
ecom %>% select(Year, item_id) %>%
group_by(Year) %>%
summarise(Total_Order = n()) -> Orders_Per_Year
bar_chart <- Orders_Per_Year %>%ggplot(aes(Year, Total_Order)) +
geom_bar(stat = "identity", fill = "#0D1282") +
labs(title = "# of Order by Year", x = "Years" , y = "# of Orders")
print(bar_chart)
ecom %>% select(FY, grand_total) %>%
group_by(FY) %>%
summarise(Total_Sales_in_Arabs = sum(grand_total/10^9)) -> sales_year
sales_year %>% ggplot(aes(FY, Total_Sales_in_Arabs)) +
geom_bar(fill="#0D1282", stat = "identity") +
labs(title = "Total Sales by Fiscal Year", x= "Fiscal Year", y = "Total Sales in Arab")
Thank You…..