Data dictionary: - Brand_Names: brand of the product - Details: Details of the product - Sizes: Sizing Availble - MRP: Max Retail Price - Sell_Price: Price sold after discount - Category: Category of the product
library(readr)
library(dplyr)
library(lubridate)
library(tidyr)
library(ggplot2)
library(stringr)
str(df)
## spc_tbl_ [30,758 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:30758] 0 1 2 3 4 5 6 7 8 9 ...
## $ BrandName: chr [1:30758] "life" "only" "fratini" "zink london" ...
## $ Deatils : chr [1:30758] "solid cotton blend collar neck womens a-line dress - indigo" "polyester peter pan collar womens blouson dress - yellow" "solid polyester blend wide neck womens regular top - off white" "stripes polyester sweetheart neck womens dress - black" ...
## $ Sizes : chr [1:30758] "Size:Large,Medium,Small,X-Large,X-Small" "Size:34,36,38,40" "Size:Large,X-Large,XX-Large" "Size:Large,Medium,Small,X-Large" ...
## $ MRP : chr [1:30758] "Rs\n1699" "Rs\n3499" "Rs\n1199" "Rs\n2299" ...
## $ SellPrice: num [1:30758] 849 2449 599 1379 849 ...
## $ Discount : chr [1:30758] "50% off" "30% off" "50% off" "40% off" ...
## $ Category : chr [1:30758] "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. BrandName = col_character(),
## .. Deatils = col_character(),
## .. Sizes = col_character(),
## .. MRP = col_character(),
## .. SellPrice = col_double(),
## .. Discount = col_character(),
## .. Category = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
colnames(df)
## [1] "...1" "BrandName" "Deatils" "Sizes" "MRP" "SellPrice"
## [7] "Discount" "Category"
# Step 1: Removing the row number column
df <- df[ , -1]
# Step 2: Renaming columns
df <- rename(df, Brand = BrandName, Sale_Price = SellPrice)
# Step 3: Remove "Rs\n" from MRP column
df$MRP <- gsub("Rs\n", "", df$MRP)
# Step 4: Remove "Size:" from Sizes column
df$Sizes <- gsub("Size:", "", df$Sizes)
# Step 5: Remove "% off" from Discount column
df$Discount <- gsub("% off", "", df$Discount)
# Step 6: Convert Sale_Price & Discount columns into int
df$Sale_Price <- as.integer(df$Sale_Price)
df$Discount <- as.integer(df$Discount)
# Checking the structure of df to confirm changes
str(df)
## tibble [30,758 × 7] (S3: tbl_df/tbl/data.frame)
## $ Brand : chr [1:30758] "life" "only" "fratini" "zink london" ...
## $ Deatils : chr [1:30758] "solid cotton blend collar neck womens a-line dress - indigo" "polyester peter pan collar womens blouson dress - yellow" "solid polyester blend wide neck womens regular top - off white" "stripes polyester sweetheart neck womens dress - black" ...
## $ Sizes : chr [1:30758] "Large,Medium,Small,X-Large,X-Small" "34,36,38,40" "Large,X-Large,XX-Large" "Large,Medium,Small,X-Large" ...
## $ MRP : chr [1:30758] "1699" "3499" "1199" "2299" ...
## $ Sale_Price: int [1:30758] 849 2449 599 1379 849 1615 649 649 839 599 ...
## $ Discount : int [1:30758] 50 30 50 40 50 10 50 50 60 60 ...
## $ Category : chr [1:30758] "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" "Westernwear-Women" ...
# Step 7: Drop NA Values
df <- drop_na(df)
# Display unique values in the Category column
unique_categories <- unique(df$Category)
# Print the unique Categories
print(unique_categories)
## [1] "Westernwear-Women" "Indianwear-Women"
## [3] "Lingerie&Nightwear-Women" "Footwear-Women"
## [5] "Watches-Women" "Fragrance-Women"
## [7] "Jewellery-Women"
# Step 8: Removing rows where Category is "Watches-Women" or "Jewellery-Women"
df <- df %>%
filter(!Category %in% c("Watches-Women", "Jewellery-Women", "Fragrance-Women"))
# Price Distribution Plot
ggplot(df, aes(x=Sale_Price)) +
geom_histogram(aes(y=..density..), binwidth=500, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
labs(title="Price Distribution Plot", x="Sell Price", y="Density")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Price Spread (Boxplot)
ggplot(df, aes(y=Sale_Price)) +
geom_boxplot(fill="lightblue", colour="black") +
labs(title="Price Spread", x="", y="Sell Price")
# Calculate the frequency of each Sale_Price
prices <- df %>%
count(Sale_Price) %>%
arrange(Sale_Price)
# Create a scatter plot to visualize the frequencies
ggplot(prices, aes(x=Sale_Price, y=n)) +
geom_point(alpha=0.6, size=3) +
theme_minimal() +
labs(title="Selling Prices", x="Sale Price", y="Frequency") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_blank())
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Basic Descriptive Statistics
basic_stats <- df %>% summarise(Mean = mean(Sale_Price, na.rm = TRUE),
SD = sd(Sale_Price, na.rm = TRUE),
Min = min(Sale_Price, na.rm = TRUE),
Max = max(Sale_Price, na.rm = TRUE))
# Specific Percentiles
percentiles <- quantile(df$Sale_Price, probs = c(0.25, 0.50, 0.75, 0.85, 0.95), na.rm = TRUE)
# Combine both sets of information
summary_stats <- cbind(basic_stats, t(as.data.frame(percentiles)))
print(summary_stats)
## Mean SD Min Max 25% 50% 75% 85% 95%
## percentiles 1236.735 770.807 74 13599 699 1039 1609 2021 2655
# Calculate the maximum SellPrice for each BrandName and Category combination
max_price <- df %>%
group_by(Brand, Category) %>%
summarise(MaxSellPrice = max(Sale_Price, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(MaxSellPrice)) %>%
head(10)
## `summarise()` has grouped output by 'Brand'. You can override using the
## `.groups` argument.
# Display the top 10 results
print(max_price)
## # A tibble: 10 × 3
## Brand Category MaxSellPrice
## <chr> <chr> <int>
## 1 global desi Indianwear-Women 13599
## 2 w Indianwear-Women 7499
## 3 new balance Footwear-Women 6999
## 4 clarks Footwear-Women 6299
## 5 guess Footwear-Women 6299
## 6 forever new Westernwear-Women 6000
## 7 biba Indianwear-Women 5596
## 8 myshka Indianwear-Women 5399
## 9 juniper Indianwear-Women 5157
## 10 modare Footwear-Women 4899
# Bar plot for the top 10 costly clothing brands
ggplot(max_price, aes(x=reorder(Brand, -MaxSellPrice), y=MaxSellPrice, fill=Brand)) +
geom_bar(stat="identity") +
theme_minimal() +
labs(title="Top 10 Costly Brands", x="Brand Name", y="Maximum Sell Price") +
theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
# Bar plot for the categories within the top 10 costly brands/categories
ggplot(max_price, aes(x=reorder(Category, -MaxSellPrice), y=MaxSellPrice, fill=Category)) +
geom_bar(stat="identity") +
theme_minimal() +
labs(title="Top Costly Categories within Top 10 Entries", x="Category", y="Maximum Sell Price") +
theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
Footwear-Womens and
Western-Women Category.# Calculate the minimum SellPrice for each BrandName and Category combination
min_price <- df %>%
group_by(Brand, Category) %>%
summarise(MinSellPrice = min(Sale_Price, na.rm = TRUE)) %>%
ungroup() %>%
arrange(MinSellPrice) %>%
head(10)
## `summarise()` has grouped output by 'Brand'. You can override using the
## `.groups` argument.
# Display the top 10 results with the lowest minimum SellPrice
print(min_price)
## # A tibble: 10 × 3
## Brand Category MinSellPrice
## <chr> <chr> <int>
## 1 life Westernwear-Women 74
## 2 clovia Lingerie&Nightwear-Women 114
## 3 insense Lingerie&Nightwear-Women 119
## 4 nayomi Lingerie&Nightwear-Women 149
## 5 enamor Lingerie&Nightwear-Women 150
## 6 soie Lingerie&Nightwear-Women 192
## 7 wacoal Lingerie&Nightwear-Women 197
## 8 altlife Westernwear-Women 199
## 9 infuse Lingerie&Nightwear-Women 199
## 10 infuse Westernwear-Women 199
# Top 10 Cheap Brands based on minimum SellPrice
ggplot(min_price, aes(x=reorder(Brand, MinSellPrice), y=MinSellPrice, fill=Brand)) +
geom_bar(stat="identity") +
labs(title="Top 10 Cheap Brands", x="Brand Name", y="Minimum Sell Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle=65, hjust=1), legend.position="none")
# Top Cheap Categories within the top 10 entries
ggplot(min_price, aes(x=reorder(Category, MinSellPrice), y=MinSellPrice, fill=Category)) +
geom_bar(stat="identity") +
labs(title="Top Cheap Categories within Top 10 Entries", x="Category", y="Minimum Sell Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle=65, hjust=1), legend.position="none")
Western-Women and
Lingerie&Nightwear-Women
Western-Women was also ranked as the most
Costly aswell. (Meaning that Western-Women clothing has drastic price
range from cheap to costly)# Count the occurrences of each BrandName for each SellPrice
df_summary <- df %>%
group_by(Sale_Price, Brand) %>%
summarise(Count = n()) %>%
ungroup() %>%
arrange(desc(Count)) %>%
head(20)
## `summarise()` has grouped output by 'Sale_Price'. You can override using the
## `.groups` argument.
# Plot the top 20 SellPrice and BrandName combinations
ggplot(df_summary, aes(x=reorder(paste(Brand, Sale_Price, sep = "_"), Count), y=Count)) +
geom_col(fill = '#1a55FF') +
labs(title = "Top 20 SellPrice and BrandName Combinations", x = "BrandName and SellPrice", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle=65, hjust=1))
- The plot above shows the Top selling brands along it’s Selling Price.
- As we can see one of the most affordable brands (Life) makes up for
its cost with higher volume of sales.
# Aggregating category counts
inv_clothing <- df %>%
group_by(Category) %>%
summarise(Count = n())
# Creating the bar plot
ggplot(inv_clothing, aes(x=Category, y=Count, fill=Category)) +
geom_bar(stat="identity") +
theme_minimal() +
labs(title="Category Counts", x="Category", y="Count") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1))
- The Plot above shows the volume of sales from each category. -
Although in the category
Footwear-Woman has the highest
ticketed items in our inventory, they make up the least volume of our
inventory. - Following Lingerie&Nightwear-Woman with
slightly higher volume of our inventory. - Following
Western-Woman category with the highest volume of our
inventory.(This explains the wide range of price within this
category)
# Recalculating category counts for the dot and line plot
inv_clothing <- table(df$Category)
# Creating the dot and line plot
plot(inv_clothing, type="o", pch=20, cex=2, lwd=2, col="blue", xlab="Category", ylab="Count")
title("Category Counts")
# Filter the data for the specified categories
filtered_df <- df %>%
filter(Category %in% c("Footwear-Women", "Westernwear-Women", "Lingerie&Nightwear-Women"))
# Aggregate the data to count items per brand within these categories
brand_counts <- filtered_df %>%
group_by(Category, Brand) %>%
summarise(Count = n(), .groups = 'drop') %>%
arrange(Category, desc(Count))
# For visualization, we will plot one category at a time to maintain clarity
plot_category <- function(data, category_name){
top_brands <- data %>%
filter(Category == category_name) %>%
slice_max(Count, n = 5)
ggplot(top_brands, aes(x=reorder(Brand, Count), y=Count, fill=Brand)) +
geom_col() +
coord_flip() + # Makes it easier to read brand names
labs(title=paste("Top 5 Selling Brands in", category_name),
x="Brand Name", y="Count") +
theme_minimal() +
theme(legend.position="none") # Hide legend to avoid redundancy
}
# Create and display separate plots for each category
categories <- c("Footwear-Women", "Westernwear-Women", "Lingerie&Nightwear-Women")
lapply(categories, function(cat) plot_category(brand_counts, cat))
## [[1]]
##
## [[2]]
##
## [[3]]
## Inferences: - The plots above shows the Top 5 selling brands in our
current inventory. - From the brands and volume of sales we can make a
rough customer segmentation on our customers. As majority of the brands
are more cost effective rather than the more costly brands within all
categories of our inventory. (Life, Zink London, Catwalk, Enamor,
etc.)
# Brand and Discount Distribution
df %>%
group_by(Brand) %>%
summarise(AverageDiscount = mean(Discount, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(Brand, -AverageDiscount), y = AverageDiscount)) +
geom_col() +
coord_flip() +
labs(title = "Average Discount by BrandName", x = "Brand Name", y = "Average Discount") +
theme_minimal()
- The visualization above depicts the discounts or “break-even” brand
items that the online retail store struggle moving.
df %>%
count(Discount) %>%
arrange(desc(n)) %>%
head(30) %>%
ggplot(aes(x = reorder(Discount, n), y = n)) +
geom_bar(stat = "identity") +
labs(title = "Top 100 Discounts by Frequency", x = "Discount", y = "Frequency") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
- The plot above shows the individual items within our inventory that
have a record of being discounted, due to product shelf life. -
Particularly items (60, 20, 40, 30, 50)
# Calculate the maximum discount for each category
Dis <- df %>%
group_by(Category) %>%
summarise(MaxDiscount = max(Discount, na.rm = TRUE))
# Create the plot
ggplot(Dis, aes(x = Category, y = MaxDiscount)) +
geom_point(alpha=0.6, size=5, color="blue") + # Dots with specified properties
geom_line(color="red", group=1) + # Connects the points with lines
labs(title = "Maximum Discount by Category", x = "Category", y = "Maximum Discount (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + # Rotate category names for readability
theme(panel.grid.major = element_line(color = "grey", size = 0.25)) # Customize grid
-
Westernwear-Woman and
Lingerie&Nightwear-Woman category has the highest rate
of discounts
# Prepare data for "BrandName and Sizes" plot
brand_sizes_count <- df %>%
count(Brand, Sizes) %>%
arrange(desc(n)) %>%
head(10)
# Prepare data for "Sizes Count" plot
sizes_count <- df %>%
count(Sizes) %>%
arrange(desc(n)) %>%
head(10)
ggplot(brand_sizes_count, aes(x=reorder(paste(Brand, Sizes, sep="-"), n), y=n, fill=Brand)) +
geom_bar(stat="identity", color="#bcbd22", show.legend = FALSE) +
coord_flip() +
labs(title="Top 10 BrandName and Sizes Combinations", x="BrandName and Sizes", y="Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggplot(sizes_count, aes(x=reorder(Sizes, n), y=n, fill=Sizes)) +
geom_bar(stat="identity", color="#e377c2", show.legend = FALSE) +
labs(title="Top 10 Most Common Sizes", x="Sizes", y="Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
guides(fill=guide_legend(title="Sizes"))
- The plots above show the sizing range of the brands within our
category, as the top performing sizing are either standard sizing or
following larger sizing clothing items.