#Summary

#Data preprocessing

##Creating the target directory

# Create directory
dir.create("UK-Based_business_data", showWarnings = FALSE)
# Download dataset using Kaggle CLI
system(
  "kaggle datasets download -d gabrielramos87/an-online-shop-business -p UK-Based_business_data --unzip"
)
## [1] 0

#Reading the dataset

retail <- read.csv(
  "UK-Based_business_data/Sales Transaction v.4a.csv",
  stringsAsFactors = FALSE
)

str(retail)
## 'data.frame':    536350 obs. of  8 variables:
##  $ TransactionNo: chr  "581482" "581475" "581475" "581475" ...
##  $ Date         : chr  "12/9/2019" "12/9/2019" "12/9/2019" "12/9/2019" ...
##  $ ProductNo    : chr  "22485" "22596" "23235" "23272" ...
##  $ ProductName  : chr  "Set Of 2 Wooden Market Crates" "Christmas Star Wish List Chalkboard" "Storage Tin Vintage Leaf" "Tree T-Light Holder Willie Winkie" ...
##  $ Price        : num  21.5 10.6 11.5 10.6 11.9 ...
##  $ Quantity     : int  12 36 12 12 6 24 18 12 12 24 ...
##  $ CustomerNo   : int  17490 13069 13069 13069 13069 13069 13069 13069 13069 13069 ...
##  $ Country      : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

#Loading the packages

library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.4     v readr     2.1.6
## v forcats   1.0.1     v stringr   1.6.0
## v ggplot2   4.0.1     v tibble    3.3.0
## v lubridate 1.9.4     v tidyr     1.3.2
## v purrr     1.2.0     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(knitr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

#Data cleaning and preparation

##This cleaning was done to Remove cancelled transactions, negative quantities, prices and Convert the Date to proper date format

Create Revenue variable

clean_retail <- retail %>%
  filter(!grepl("^C", TransactionNo)) %>%   # Remove cancelled invoices
  filter(Quantity > 0, Price > 0) %>%        # Remove negative values
  mutate(
    Date = as.Date(Date),
    Month = floor_date(Date, "month"),
    Revenue = Quantity * Price
  )

summary(clean_retail)
##  TransactionNo           Date             ProductNo         ProductName       
##  Length:527765      Min.   :0001-04-20   Length:527765      Length:527765     
##  Class :character   1st Qu.:0005-09-20   Class :character   Class :character  
##  Mode  :character   Median :0009-05-20   Mode  :character   Mode  :character  
##                     Mean   :0008-07-12                                        
##                     3rd Qu.:0011-10-20                                        
##                     Max.   :0012-12-20                                        
##                     NA's   :301119                                            
##      Price           Quantity         CustomerNo      Country         
##  Min.   :  5.13   Min.   :    1.0   Min.   :12004   Length:527765     
##  1st Qu.: 10.99   1st Qu.:    1.0   1st Qu.:13813   Class :character  
##  Median : 11.94   Median :    3.0   Median :15159   Mode  :character  
##  Mean   : 12.63   Mean   :   10.6   Mean   :15232                     
##  3rd Qu.: 14.09   3rd Qu.:   11.0   3rd Qu.:16729                     
##  Max.   :660.62   Max.   :80995.0   Max.   :18287                     
##                                     NA's   :1                         
##      Month               Revenue         
##  Min.   :0001-04-01   Min.   :5.130e+00  
##  1st Qu.:0005-09-01   1st Qu.:1.717e+01  
##  Median :0009-05-01   Median :4.383e+01  
##  Mean   :0008-06-23   Mean   :1.193e+02  
##  3rd Qu.:0011-10-01   3rd Qu.:1.194e+02  
##  Max.   :0012-12-01   Max.   :1.003e+06  
##  NA's   :301119
colnames(clean_retail)
##  [1] "TransactionNo" "Date"          "ProductNo"     "ProductName"  
##  [5] "Price"         "Quantity"      "CustomerNo"    "Country"      
##  [9] "Month"         "Revenue"

#Sales trend over the month

monthly_sales <- clean_retail %>%
  group_by(Month) %>%
  summarise(Total_Sales = sum(Revenue))

monthly_sales
## # A tibble: 121 x 2
##    Month      Total_Sales
##    <date>           <dbl>
##  1 0001-04-01     102906.
##  2 0001-05-01     225004.
##  3 0001-06-01     267325.
##  4 0001-07-01     189552.
##  5 0001-09-01      96938.
##  6 0001-10-01     155159.
##  7 0001-11-01     358480.
##  8 0001-12-01     132940.
##  9 0002-01-01     183508.
## 10 0002-02-01     113883.
## # i 111 more rows
ggplot(monthly_sales, aes(Month, Total_Sales)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point() +
  scale_y_continuous(
    labels = scales::label_dollar(prefix = "£")
  ) +
  labs(
    title = "Monthly Sales Trend",
    x = "Month",
    y = "Total Sales (£)"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#Checking for most frequently purchased product

top_products <- clean_retail %>%
  group_by(ProductName) %>%
  summarise(Total_Quantity = sum(Quantity)) %>%
  arrange(desc(Total_Quantity)) %>%
  slice_head(n = 10)

kable(top_products)
ProductName Total_Quantity
Paper Craft Little Birdie 80995
Medium Ceramic Top Storage Jar 78033
Popcorn Holder 56921
World War 2 Gliders Asstd Designs 55047
Jumbo Bag Red Retrospot 48478
Cream Hanging Heart T-Light Holder 37956
Pack Of 72 Retrospot Cake Cases 36515
Assorted Colour Bird Ornament 36493
Rabbit Night Light 30788
Mini Paint Set Vintage 26633
ggplot(top_products, aes(reorder(ProductName, Total_Quantity), Total_Quantity)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequently Purchased Products",
    x = "ProductName",
    y = "Total Quantity Purchased"
  ) +
  theme_minimal()

#Checking for number of products purcased per transaction

transaction_items <- clean_retail %>%
  group_by(TransactionNo) %>%
  summarise(Total_Items = sum(Quantity))

summary(transaction_items$Total_Items)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    71.0   153.0   282.5   298.0 80995.0
ggplot(transaction_items, aes(Total_Items)) +
  geom_histogram(bins = 50, fill = "orange", color = "black") +
  labs(
    title = "Distribution of Products Purchased Per Transaction",
    x = "Number of Items",
    y = "Frequency"
  ) +
  theme_minimal()

#Most profitable customer segment

##Profit by country

country_profit <- clean_retail %>%
  group_by(Country) %>%
  summarise(Total_Revenue = sum(Revenue)) %>%
  arrange(desc(Total_Revenue))

kable(head(country_profit, 10))
Country Total_Revenue
United Kingdom 52524658.5
Netherlands 2151553.6
EIRE 1713410.9
Germany 1371543.3
France 1330652.9
Australia 995607.9
Sweden 401879.9
Switzerland 361969.2
Japan 293155.4
Spain 281012.3

##Profit by Customer

customer_profit <- clean_retail %>%
  group_by(CustomerNo) %>%
  summarise(Total_Revenue = sum(Revenue)) %>%
  arrange(desc(Total_Revenue))

kable(head(customer_profit, 10))
CustomerNo Total_Revenue
14646 2112282.0
16446 1002741.6
14911 914633.5
12415 900545.5
18102 897137.4
17450 891438.5
12346 840113.8
14156 694965.0
13694 646116.8
17511 639006.2

#Conclusion and Recommendation

##Conclusion

##Business Recommendations

  1. Focus on High-Value Customers; Introduction of loyalty programs for top-spending customers. Offering volume discounts for bulk buyers.
  2. Optimize Product Strategy; Bundle top-selling products. Ensure high-demand items are always in stock.
  3. Seasonal Marketing; Increase advertising and promotions during high-sales months especially towards th end of the year. Launching gift-themed campaigns before holidays.
  4. International Expansion; Target high-performing non-UK countries with localized shipping and marketing.
  5. Reduce Cancellations; Improve order confirmation and payment validation to reduce negative quantities and cancellations.