Background Context:

Data dictionary: - Brand_Names: brand of the product - Details: Details of the product - Sizes: Sizing Availble - MRP: Max Retail Price - Sell_Price: Price sold after discount - Category: Category of the product

Business Objective:

Importing Libraries & Dataset

library(readr)
library(dplyr)
library(lubridate)
library(tidyr)
library(ggplot2)
library(stringr)

Raw Data Overview

str(df)
## spc_tbl_ [30,758 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1     : num [1:30758] 0 1 2 3 4 5 6 7 8 9 ...
##  $ BrandName: chr [1:30758] "life" "only" "fratini" "zink london" ...
##  $ Deatils  : chr [1:30758] "solid cotton blend collar neck womens a-line dress - indigo" "polyester peter pan collar womens blouson dress - yellow" "solid polyester blend wide neck womens regular top - off white" "stripes polyester sweetheart neck womens dress - black" ...
##  $ Sizes    : chr [1:30758] "Size:Large,Medium,Small,X-Large,X-Small" "Size:34,36,38,40" "Size:Large,X-Large,XX-Large" "Size:Large,Medium,Small,X-Large" ...
##  $ MRP      : chr [1:30758] "Rs\n1699" "Rs\n3499" "Rs\n1199" "Rs\n2299" ...
##  $ SellPrice: num [1:30758] 849 2449 599 1379 849 ...
##  $ Discount : chr [1:30758] "50% off" "30% off" "50% off" "40% off" ...
##  $ Category : chr [1:30758] "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   BrandName = col_character(),
##   ..   Deatils = col_character(),
##   ..   Sizes = col_character(),
##   ..   MRP = col_character(),
##   ..   SellPrice = col_double(),
##   ..   Discount = col_character(),
##   ..   Category = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
colnames(df)
## [1] "...1"      "BrandName" "Deatils"   "Sizes"     "MRP"       "SellPrice"
## [7] "Discount"  "Category"

Data Cleaning

# Step 1: Removing the row number column
df <- df[ , -1]
# Step 2: Renaming columns
df <- rename(df, Brand = BrandName, Sale_Price = SellPrice)
# Step 3: Remove "Rs\n" from MRP column
df$MRP <- gsub("Rs\n", "", df$MRP)
# Step 4: Remove "Size:" from Sizes column
df$Sizes <- gsub("Size:", "", df$Sizes)
# Step 5: Remove "% off" from Discount column
df$Discount <- gsub("% off", "", df$Discount)
# Step 6: Convert Sale_Price & Discount columns into int
df$Sale_Price <- as.integer(df$Sale_Price)
df$Discount <- as.integer(df$Discount)

# Checking the structure of df to confirm changes
str(df)
## tibble [30,758 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Brand     : chr [1:30758] "life" "only" "fratini" "zink london" ...
##  $ Deatils   : chr [1:30758] "solid cotton blend collar neck womens a-line dress - indigo" "polyester peter pan collar womens blouson dress - yellow" "solid polyester blend wide neck womens regular top - off white" "stripes polyester sweetheart neck womens dress - black" ...
##  $ Sizes     : chr [1:30758] "Large,Medium,Small,X-Large,X-Small" "34,36,38,40" "Large,X-Large,XX-Large" "Large,Medium,Small,X-Large" ...
##  $ MRP       : chr [1:30758] "1699" "3499" "1199" "2299" ...
##  $ Sale_Price: int [1:30758] 849 2449 599 1379 849 1615 649 649 839 599 ...
##  $ Discount  : int [1:30758] 50 30 50 40 50 10 50 50 60 60 ...
##  $ Category  : chr [1:30758] "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" ...
# Step 7: Drop NA Values
df <- drop_na(df)
# Display unique values in the Category column
unique_categories <- unique(df$Category)

# Print the unique Categories
print(unique_categories)
## [1] "Westernwear-Women"        "Indianwear-Women"        
## [3] "Lingerie&Nightwear-Women" "Footwear-Women"          
## [5] "Watches-Women"            "Fragrance-Women"         
## [7] "Jewellery-Women"
# Step 8: Removing rows where Category is "Watches-Women" or "Jewellery-Women"
df <- df %>%
  filter(!Category %in% c("Watches-Women", "Jewellery-Women", "Fragrance-Women"))

Visualizations

Selling Price Distribution

# Price Distribution Plot
ggplot(df, aes(x=Sale_Price)) +
  geom_histogram(aes(y=..density..), binwidth=500, colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") +
  labs(title="Price Distribution Plot", x="Sell Price", y="Density")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Price Spread (Boxplot)
ggplot(df, aes(y=Sale_Price)) +
  geom_boxplot(fill="lightblue", colour="black") +
  labs(title="Price Spread", x="", y="Sell Price")

# Calculate the frequency of each Sale_Price
prices <- df %>%
  count(Sale_Price) %>%
  arrange(Sale_Price)

# Create a scatter plot to visualize the frequencies
ggplot(prices, aes(x=Sale_Price, y=n)) +
  geom_point(alpha=0.6, size=3) +
  theme_minimal() +
  labs(title="Selling Prices", x="Sale Price", y="Frequency") +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_blank())
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Basic Descriptive Statistics
basic_stats <- df %>% summarise(Mean = mean(Sale_Price, na.rm = TRUE),
                                SD = sd(Sale_Price, na.rm = TRUE),
                                Min = min(Sale_Price, na.rm = TRUE),
                                Max = max(Sale_Price, na.rm = TRUE))

# Specific Percentiles
percentiles <- quantile(df$Sale_Price, probs = c(0.25, 0.50, 0.75, 0.85, 0.95), na.rm = TRUE)

# Combine both sets of information
summary_stats <- cbind(basic_stats, t(as.data.frame(percentiles)))

print(summary_stats)
##                 Mean      SD Min   Max 25%  50%  75%  85%  95%
## percentiles 1236.735 770.807  74 13599 699 1039 1609 2021 2655

Inferences:

  • From these two distribution graphs we can visualize the selling price of the current inventory.
    • The plot seems to be right skewed, meaning that most of the inventory are “higher” price items.
    • There seems to be significant difference between the mean and median.
    • Approximately 95% of the items are selling less than 2659.
    • The data points are slightly spread out from the mean, which indicates low variance in price. (85% of price are below 1819 whereas the remaining are between 1819 - 6999)

Most Expensive brands to Consumers

# Calculate the maximum SellPrice for each BrandName and Category combination
max_price <- df %>%
  group_by(Brand, Category) %>%
  summarise(MaxSellPrice = max(Sale_Price, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(MaxSellPrice)) %>%
  head(10)
## `summarise()` has grouped output by 'Brand'. You can override using the
## `.groups` argument.
# Display the top 10 results
print(max_price)
## # A tibble: 10 × 3
##    Brand       Category          MaxSellPrice
##    <chr>       <chr>                    <int>
##  1 global desi Indianwear-Women         13599
##  2 w           Indianwear-Women          7499
##  3 new balance Footwear-Women            6999
##  4 clarks      Footwear-Women            6299
##  5 guess       Footwear-Women            6299
##  6 forever new Westernwear-Women         6000
##  7 biba        Indianwear-Women          5596
##  8 myshka      Indianwear-Women          5399
##  9 juniper     Indianwear-Women          5157
## 10 modare      Footwear-Women            4899
# Bar plot for the top 10 costly clothing brands
ggplot(max_price, aes(x=reorder(Brand, -MaxSellPrice), y=MaxSellPrice, fill=Brand)) +
  geom_bar(stat="identity") +
  theme_minimal() +
  labs(title="Top 10 Costly Brands", x="Brand Name", y="Maximum Sell Price") +
  theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")

# Bar plot for the categories within the top 10 costly brands/categories
ggplot(max_price, aes(x=reorder(Category, -MaxSellPrice), y=MaxSellPrice, fill=Category)) +
  geom_bar(stat="identity") +
  theme_minimal() +
  labs(title="Top Costly Categories within Top 10 Entries", x="Category", y="Maximum Sell Price") +
  theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")

Inferences:

  • These are the top 10 most costly brands in our inventory.
    • Majority of the brands are in the Footwear-Womens and Western-Women Category.

Most Affordable Brands to Consumers

# Calculate the minimum SellPrice for each BrandName and Category combination
min_price <- df %>%
  group_by(Brand, Category) %>%
  summarise(MinSellPrice = min(Sale_Price, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(MinSellPrice) %>%
  head(10)
## `summarise()` has grouped output by 'Brand'. You can override using the
## `.groups` argument.
# Display the top 10 results with the lowest minimum SellPrice
print(min_price)
## # A tibble: 10 × 3
##    Brand   Category                 MinSellPrice
##    <chr>   <chr>                           <int>
##  1 life    Westernwear-Women                  74
##  2 clovia  Lingerie&Nightwear-Women          114
##  3 insense Lingerie&Nightwear-Women          119
##  4 nayomi  Lingerie&Nightwear-Women          149
##  5 enamor  Lingerie&Nightwear-Women          150
##  6 soie    Lingerie&Nightwear-Women          192
##  7 wacoal  Lingerie&Nightwear-Women          197
##  8 altlife Westernwear-Women                 199
##  9 infuse  Lingerie&Nightwear-Women          199
## 10 infuse  Westernwear-Women                 199
# Top 10 Cheap Brands based on minimum SellPrice
ggplot(min_price, aes(x=reorder(Brand, MinSellPrice), y=MinSellPrice, fill=Brand)) +
  geom_bar(stat="identity") +
  labs(title="Top 10 Cheap Brands", x="Brand Name", y="Minimum Sell Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=65, hjust=1), legend.position="none")

  • These are the top 10 Cheapest brands in our inventory.
# Top Cheap Categories within the top 10 entries
ggplot(min_price, aes(x=reorder(Category, MinSellPrice), y=MinSellPrice, fill=Category)) +
  geom_bar(stat="identity") +
  labs(title="Top Cheap Categories within Top 10 Entries", x="Category", y="Minimum Sell Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=65, hjust=1), legend.position="none")

Inferences:

  • The cheapest categories are Western-Women and Lingerie&Nightwear-Women
    • However Western-Women was also ranked as the most Costly aswell. (Meaning that Western-Women clothing has drastic price range from cheap to costly)
# Count the occurrences of each BrandName for each SellPrice
df_summary <- df %>%
  group_by(Sale_Price, Brand) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  head(20)
## `summarise()` has grouped output by 'Sale_Price'. You can override using the
## `.groups` argument.
# Plot the top 20 SellPrice and BrandName combinations
ggplot(df_summary, aes(x=reorder(paste(Brand, Sale_Price, sep = "_"), Count), y=Count)) +
  geom_col(fill = '#1a55FF') +
  labs(title = "Top 20 SellPrice and BrandName Combinations", x = "BrandName and SellPrice", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=65, hjust=1))

- The plot above shows the Top selling brands along it’s Selling Price. - As we can see one of the most affordable brands (Life) makes up for its cost with higher volume of sales.

# Aggregating category counts
inv_clothing <- df %>%
  group_by(Category) %>%
  summarise(Count = n())

# Creating the bar plot
ggplot(inv_clothing, aes(x=Category, y=Count, fill=Category)) +
  geom_bar(stat="identity") +
  theme_minimal() +
  labs(title="Category Counts", x="Category", y="Count") +
  theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1))

- The Plot above shows the volume of sales from each category. - Although in the category Footwear-Woman has the highest ticketed items in our inventory, they make up the least volume of our inventory. - Following Lingerie&Nightwear-Woman with slightly higher volume of our inventory. - Following Western-Woman category with the highest volume of our inventory.(This explains the wide range of price within this category)

# Recalculating category counts for the dot and line plot
inv_clothing <- table(df$Category)

# Creating the dot and line plot
plot(inv_clothing, type="o", pch=20, cex=2, lwd=2, col="blue", xlab="Category", ylab="Count")
title("Category Counts")

# Filter the data for the specified categories
filtered_df <- df %>%
  filter(Category %in% c("Footwear-Women", "Westernwear-Women", "Lingerie&Nightwear-Women"))

# Aggregate the data to count items per brand within these categories
brand_counts <- filtered_df %>%
  group_by(Category, Brand) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  arrange(Category, desc(Count))

# For visualization, we will plot one category at a time to maintain clarity
plot_category <- function(data, category_name){
  top_brands <- data %>%
    filter(Category == category_name) %>%
    slice_max(Count, n = 5)
  
  ggplot(top_brands, aes(x=reorder(Brand, Count), y=Count, fill=Brand)) +
    geom_col() +
    coord_flip() +  # Makes it easier to read brand names
    labs(title=paste("Top 5 Selling Brands in", category_name),
         x="Brand Name", y="Count") +
    theme_minimal() +
    theme(legend.position="none") # Hide legend to avoid redundancy
}

# Create and display separate plots for each category
categories <- c("Footwear-Women", "Westernwear-Women", "Lingerie&Nightwear-Women")
lapply(categories, function(cat) plot_category(brand_counts, cat))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## Inferences: - The plots above shows the Top 5 selling brands in our current inventory. - From the brands and volume of sales we can make a rough customer segmentation on our customers. As majority of the brands are more cost effective rather than the more costly brands within all categories of our inventory. (Life, Zink London, Catwalk, Enamor, etc.)

# Brand and Discount Distribution
df %>%
  group_by(Brand) %>%
  summarise(AverageDiscount = mean(Discount, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(Brand, -AverageDiscount), y = AverageDiscount)) +
  geom_col() +
  coord_flip() +
  labs(title = "Average Discount by BrandName", x = "Brand Name", y = "Average Discount") +
  theme_minimal()

- The visualization above depicts the discounts or “break-even” brand items that the online retail store struggle moving.

df %>%
  count(Discount) %>%
  arrange(desc(n)) %>%
  head(30) %>%
  ggplot(aes(x = reorder(Discount, n), y = n)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 100 Discounts by Frequency", x = "Discount", y = "Frequency") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

- The plot above shows the individual items within our inventory that have a record of being discounted, due to product shelf life. - Particularly items (60, 20, 40, 30, 50)

# Calculate the maximum discount for each category
Dis <- df %>%
  group_by(Category) %>%
  summarise(MaxDiscount = max(Discount, na.rm = TRUE))


# Create the plot
ggplot(Dis, aes(x = Category, y = MaxDiscount)) +
  geom_point(alpha=0.6, size=5, color="blue") +  # Dots with specified properties
  geom_line(color="red", group=1) +  # Connects the points with lines
  labs(title = "Maximum Discount by Category", x = "Category", y = "Maximum Discount (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +  # Rotate category names for readability
  theme(panel.grid.major = element_line(color = "grey", size = 0.25))  # Customize grid

- Westernwear-Woman and Lingerie&Nightwear-Woman category has the highest rate of discounts

# Prepare data for "BrandName and Sizes" plot
brand_sizes_count <- df %>%
  count(Brand, Sizes) %>%
  arrange(desc(n)) %>%
  head(10)

# Prepare data for "Sizes Count" plot
sizes_count <- df %>%
  count(Sizes) %>%
  arrange(desc(n)) %>%
  head(10)
ggplot(brand_sizes_count, aes(x=reorder(paste(Brand, Sizes, sep="-"), n), y=n, fill=Brand)) +
  geom_bar(stat="identity", color="#bcbd22", show.legend = FALSE) +
  coord_flip() +
  labs(title="Top 10 BrandName and Sizes Combinations", x="BrandName and Sizes", y="Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=45, hjust=1))

ggplot(sizes_count, aes(x=reorder(Sizes, n), y=n, fill=Sizes)) +
  geom_bar(stat="identity", color="#e377c2", show.legend = FALSE) +
  labs(title="Top 10 Most Common Sizes", x="Sizes", y="Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  guides(fill=guide_legend(title="Sizes"))

- The plots above show the sizing range of the brands within our category, as the top performing sizing are either standard sizing or following larger sizing clothing items.