#Summary
#Data preprocessing
##Creating the target directory
# Create directory
dir.create("UK-Based_business_data", showWarnings = FALSE)
# Download dataset using Kaggle CLI
system(
"kaggle datasets download -d gabrielramos87/an-online-shop-business -p UK-Based_business_data --unzip"
)
## [1] 0
#Reading the dataset
retail <- read.csv(
"UK-Based_business_data/Sales Transaction v.4a.csv",
stringsAsFactors = FALSE
)
str(retail)
## 'data.frame': 536350 obs. of 8 variables:
## $ TransactionNo: chr "581482" "581475" "581475" "581475" ...
## $ Date : chr "12/9/2019" "12/9/2019" "12/9/2019" "12/9/2019" ...
## $ ProductNo : chr "22485" "22596" "23235" "23272" ...
## $ ProductName : chr "Set Of 2 Wooden Market Crates" "Christmas Star Wish List Chalkboard" "Storage Tin Vintage Leaf" "Tree T-Light Holder Willie Winkie" ...
## $ Price : num 21.5 10.6 11.5 10.6 11.9 ...
## $ Quantity : int 12 36 12 12 6 24 18 12 12 24 ...
## $ CustomerNo : int 17490 13069 13069 13069 13069 13069 13069 13069 13069 13069 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
#Loading the packages
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.0
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(knitr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
#Data cleaning and preparation
##This cleaning was done to Remove cancelled transactions, negative quantities, prices and Convert the Date to proper date format
Create Revenue variable
clean_retail <- retail %>%
filter(!grepl("^C", TransactionNo)) %>% # Remove cancelled invoices
filter(Quantity > 0, Price > 0) %>% # Remove negative values
mutate(
Date = as.Date(Date),
Month = floor_date(Date, "month"),
Revenue = Quantity * Price
)
summary(clean_retail)
## TransactionNo Date ProductNo ProductName
## Length:527765 Min. :0001-04-20 Length:527765 Length:527765
## Class :character 1st Qu.:0005-09-20 Class :character Class :character
## Mode :character Median :0009-05-20 Mode :character Mode :character
## Mean :0008-07-12
## 3rd Qu.:0011-10-20
## Max. :0012-12-20
## NA's :301119
## Price Quantity CustomerNo Country
## Min. : 5.13 Min. : 1.0 Min. :12004 Length:527765
## 1st Qu.: 10.99 1st Qu.: 1.0 1st Qu.:13813 Class :character
## Median : 11.94 Median : 3.0 Median :15159 Mode :character
## Mean : 12.63 Mean : 10.6 Mean :15232
## 3rd Qu.: 14.09 3rd Qu.: 11.0 3rd Qu.:16729
## Max. :660.62 Max. :80995.0 Max. :18287
## NA's :1
## Month Revenue
## Min. :0001-04-01 Min. :5.130e+00
## 1st Qu.:0005-09-01 1st Qu.:1.717e+01
## Median :0009-05-01 Median :4.383e+01
## Mean :0008-06-23 Mean :1.193e+02
## 3rd Qu.:0011-10-01 3rd Qu.:1.194e+02
## Max. :0012-12-01 Max. :1.003e+06
## NA's :301119
colnames(clean_retail)
## [1] "TransactionNo" "Date" "ProductNo" "ProductName"
## [5] "Price" "Quantity" "CustomerNo" "Country"
## [9] "Month" "Revenue"
#Sales trend over the month
monthly_sales <- clean_retail %>%
group_by(Month) %>%
summarise(Total_Sales = sum(Revenue))
monthly_sales
## # A tibble: 121 x 2
## Month Total_Sales
## <date> <dbl>
## 1 0001-04-01 102906.
## 2 0001-05-01 225004.
## 3 0001-06-01 267325.
## 4 0001-07-01 189552.
## 5 0001-09-01 96938.
## 6 0001-10-01 155159.
## 7 0001-11-01 358480.
## 8 0001-12-01 132940.
## 9 0002-01-01 183508.
## 10 0002-02-01 113883.
## # i 111 more rows
ggplot(monthly_sales, aes(Month, Total_Sales)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point() +
scale_y_continuous(
labels = scales::label_dollar(prefix = "£")
) +
labs(
title = "Monthly Sales Trend",
x = "Month",
y = "Total Sales (£)"
) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
#Checking for most frequently purchased product
top_products <- clean_retail %>%
group_by(ProductName) %>%
summarise(Total_Quantity = sum(Quantity)) %>%
arrange(desc(Total_Quantity)) %>%
slice_head(n = 10)
kable(top_products)
| ProductName | Total_Quantity |
|---|---|
| Paper Craft Little Birdie | 80995 |
| Medium Ceramic Top Storage Jar | 78033 |
| Popcorn Holder | 56921 |
| World War 2 Gliders Asstd Designs | 55047 |
| Jumbo Bag Red Retrospot | 48478 |
| Cream Hanging Heart T-Light Holder | 37956 |
| Pack Of 72 Retrospot Cake Cases | 36515 |
| Assorted Colour Bird Ornament | 36493 |
| Rabbit Night Light | 30788 |
| Mini Paint Set Vintage | 26633 |
ggplot(top_products, aes(reorder(ProductName, Total_Quantity), Total_Quantity)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Top 10 Most Frequently Purchased Products",
x = "ProductName",
y = "Total Quantity Purchased"
) +
theme_minimal()
#Checking for number of products purcased per transaction
transaction_items <- clean_retail %>%
group_by(TransactionNo) %>%
summarise(Total_Items = sum(Quantity))
summary(transaction_items$Total_Items)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 71.0 153.0 282.5 298.0 80995.0
ggplot(transaction_items, aes(Total_Items)) +
geom_histogram(bins = 50, fill = "orange", color = "black") +
labs(
title = "Distribution of Products Purchased Per Transaction",
x = "Number of Items",
y = "Frequency"
) +
theme_minimal()
#Most profitable customer segment
##Profit by country
country_profit <- clean_retail %>%
group_by(Country) %>%
summarise(Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue))
kable(head(country_profit, 10))
| Country | Total_Revenue |
|---|---|
| United Kingdom | 52524658.5 |
| Netherlands | 2151553.6 |
| EIRE | 1713410.9 |
| Germany | 1371543.3 |
| France | 1330652.9 |
| Australia | 995607.9 |
| Sweden | 401879.9 |
| Switzerland | 361969.2 |
| Japan | 293155.4 |
| Spain | 281012.3 |
##Profit by Customer
customer_profit <- clean_retail %>%
group_by(CustomerNo) %>%
summarise(Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue))
kable(head(customer_profit, 10))
| CustomerNo | Total_Revenue |
|---|---|
| 14646 | 2112282.0 |
| 16446 | 1002741.6 |
| 14911 | 914633.5 |
| 12415 | 900545.5 |
| 18102 | 897137.4 |
| 17450 | 891438.5 |
| 12346 | 840113.8 |
| 14156 | 694965.0 |
| 13694 | 646116.8 |
| 17511 | 639006.2 |
#Conclusion and Recommendation
##Conclusion
##Business Recommendations