This synthetic dataset belongs to one of the World’s leading sportswear and footwear brands, Nike. It contains:

AIM

Determine Top Products to stock in 2026.

QUESTIONS

OBJECTIVES

Step1: Load the Necessary Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(tidyr)
library(scales )
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Step2:Read in the dataset

sales_details<-read.csv("/Users/mac/Downloads/Nike sales cleaned data.csv")
glimpse(sales_details)
## Rows: 2,500
## Columns: 12
## $ Order_ID         <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category  <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line     <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name     <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size             <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "XI", "M", "…
## $ Units_Sold       <int> 2, 3, 4, 2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 2, 2, 2, 1, 4,…
## $ MRP              <int> 6040, 4958, 6040, 9674, 6040, 7364, 6820, 6040, 6040,…
## $ Discount_Applied <dbl> 0.470, 0.615, 0.615, 0.615, 0.615, 0.615, 0.615, 0.32…
## $ Revenue          <int> 6402, 5726, 9302, 7449, 4651, 2835, 5251, 4107, 2325,…
## $ Order_Date       <chr> "2024-09-03", "2024-09-07", "2024-10-11", "2024-10-04…
## $ Sales_Channel    <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region           <chr> "Bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
colSums(is.na(sales_details))
##         Order_ID  Gender_Category     Product_Line     Product_Name 
##                0                0                0                0 
##             Size       Units_Sold              MRP Discount_Applied 
##              510                0                0                0 
##          Revenue       Order_Date    Sales_Channel           Region 
##                0                0                0                0

Step3: Convert Categorical variables to factors

sales_details$Gender_Category <- as.factor(sales_details$Gender_Category)
glimpse(sales_details$Gender_Category)
##  Factor w/ 3 levels "Kids","Men","Women": 1 3 3 1 1 3 2 1 3 2 ...
sales_details$Product_Line <- as.factor(sales_details$Product_Line)
glimpse(sales_details$Product_Line)
##  Factor w/ 5 levels "Basketball","Lifestyle",..: 5 4 4 2 3 5 5 5 4 4 ...
sales_details$Product_Name <- as.factor(sales_details$Product_Name)
glimpse(sales_details$Product_Name)
##  Factor w/ 20 levels "Air Force 1",..: 16 17 14 4 15 6 16 16 14 10 ...
sales_details$Size <- as.factor(sales_details$Size)
glimpse(sales_details$Size)
##  Factor w/ 10 levels "IX","L","M","VI",..: 3 3 3 2 10 3 3 3 8 3 ...
sales_details$Sales_Channel <- as.factor(sales_details$Sales_Channel)
glimpse(sales_details$Sales_Channel)
##  Factor w/ 2 levels "Online","Retail": 1 2 2 1 2 2 1 2 1 1 ...
sales_details$Region <- as.factor(sales_details$Region)
glimpse(sales_details$Region)
##  Factor w/ 9 levels "Bangalore","Bengaluru",..: 2 4 8 9 3 3 1 1 5 9 ...
sales_details$Product_Line<-as.factor(sales_details$Product_Line)
glimpse(sales_details$Product_Line)
##  Factor w/ 5 levels "Basketball","Lifestyle",..: 5 4 4 2 3 5 5 5 4 4 ...

Step4:Check the orders by Gender_Category

ggplot(sales_details, aes(Gender_Category)) +
  geom_bar(fill="lightgreen") +
  theme_minimal() +
  labs(title="Orders by Gender Category")

table(sales_details$Gender_Category)
## 
##  Kids   Men Women 
##   841   840   819

From above, kids and Men accounts for the highest orders with just kids leading with just one order.

Step5:Check Orders by Region

ggplot(sales_details, aes(Region)) +
  geom_bar(fill="lightgreen") +
  theme_minimal() +
  labs(title="Orders by Region")

table(sales_details$Region)
## 
## Bangalore Bengaluru     Delhi       Hyd Hyderabad  Hyderbad   Kolkata    Mumbai 
##       216       220       438       136       126       141       417       418 
##      Pune 
##       388

From above, most orders originated from Delhi than any other region. This indicates that customers from Delhi are the most active contributing the largest share of total orders.

Step6: Check Orders by Channel

ggplot(sales_details, aes(Sales_Channel)) +
  geom_bar(fill="lightgreen") +
  theme_minimal() +
  labs(title="Orders by Sales Channel")

table(sales_details$Sales_Channel)
## 
## Online Retail 
##   1255   1245

From above, the online channel has the highest number of orders leading with ten orders.

Step7:Group Product Name by Product line

library(dplyr)

# Summarize units sold per Product Name within each Product Line
product_summary <- sales_details %>%
  group_by(Product_Line, Product_Name) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE),
            total_revenue = sum(Revenue, na.rm = TRUE)) %>%
  arrange(Product_Line, desc(total_units))  # highest units first per line
## `summarise()` has grouped output by 'Product_Line'. You can override using the
## `.groups` argument.
product_summary
## # A tibble: 20 × 4
## # Groups:   Product_Line [5]
##    Product_Line Product_Name       total_units total_revenue
##    <fct>        <fct>                    <int>         <int>
##  1 Basketball   LeBron 20                  278        811545
##  2 Basketball   Air Jordan                 265        767045
##  3 Basketball   Zoom Freak                 246        703635
##  4 Basketball   Kyrie Flytrap              217        540874
##  5 Lifestyle    Waffle One                 279        742751
##  6 Lifestyle    Blazer Mid                 267        732817
##  7 Lifestyle    Dunk Low                   243        705320
##  8 Lifestyle    Air Force 1                239        708072
##  9 Running      React Infinity             275        679475
## 10 Running      Free RN                    240        661652
## 11 Running      Air Zoom                   206        563047
## 12 Running      Pegasus Turbo              198        572598
## 13 Soccer       Phantom GT                 255        707336
## 14 Soccer       Premier III                242        605251
## 15 Soccer       Tiempo Legend              224        634785
## 16 Soccer       Mercurial Superfly         217        604607
## 17 Training     SuperRep Go                308        835423
## 18 Training     Flex Trainer               282        782761
## 19 Training     ZoomX Invincible           274        779115
## 20 Training     Metcon 7                   224        622380

I grouped Product Name by Product Line so as to identify which Product Name falls under Product Line and also to know the total units of each product sold out with the revenue generated.

Step8: Check Total Units sold by Product Line.

productline_units <- sales_details %>%
  group_by(Product_Line) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
  arrange(desc(total_units))
productline_units
## # A tibble: 5 × 2
##   Product_Line total_units
##   <fct>              <int>
## 1 Training            1088
## 2 Lifestyle           1028
## 3 Basketball          1006
## 4 Soccer               938
## 5 Running              919
ggplot(productline_units, aes(x = reorder(Product_Line, total_units), y = total_units)) +
  geom_col(fill = "lightgreen") +
  geom_text(aes(label = total_units), hjust = -0.05) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Total Units Sold by Product Line", x = "Product Line", y ="Total Units Sold") +
  scale_y_continuous(expand = expansion(mult = c(0,0.25)))

From above, Training Product line is the highest units sold.

Step9: Check Total Revenue generated by Product Line

productline_revenue <- sales_details %>%
  group_by(Product_Line) %>%
  summarise(total_revenue = sum(Revenue, na.rm = TRUE)) %>%
  arrange(desc(total_revenue))
productline_revenue
## # A tibble: 5 × 2
##   Product_Line total_revenue
##   <fct>                <int>
## 1 Training           3019679
## 2 Lifestyle          2888960
## 3 Basketball         2823099
## 4 Soccer             2551979
## 5 Running            2476772
ggplot(productline_revenue, aes(x = reorder(Product_Line, total_revenue), y = total_revenue)) +
  geom_col(fill = "lightgreen") +
  geom_text(aes(label = paste0("#", format(total_revenue, big.mark=","))), hjust = -0.05) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Total Revenue Generated by Product Line", x = "Product Line", y ="Total Revenue") +
  scale_y_continuous(expand = expansion(mult = c(0,0.25)))

From above, Training product line generates the highest revenue for Nike Sales store.

Step 10: Check Total Units Sold by Product Line per Gender

table <- sales_details%>%
  group_by(Product_Line, Gender_Category) %>%
  summarise(TotalUnitsSold = sum(Units_Sold, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Product_Line'. You can override using the
## `.groups` argument.
table
## # A tibble: 15 × 3
##    Product_Line Gender_Category TotalUnitsSold
##    <fct>        <fct>                    <int>
##  1 Basketball   Kids                       338
##  2 Basketball   Men                        351
##  3 Basketball   Women                      317
##  4 Lifestyle    Kids                       352
##  5 Lifestyle    Men                        331
##  6 Lifestyle    Women                      345
##  7 Running      Kids                       319
##  8 Running      Men                        283
##  9 Running      Women                      317
## 10 Soccer       Kids                       310
## 11 Soccer       Men                        330
## 12 Soccer       Women                      298
## 13 Training     Kids                       355
## 14 Training     Men                        385
## 15 Training     Women                      348
ggplot(table, aes(x = reorder(Product_Line, TotalUnitsSold), y = TotalUnitsSold, fill = Gender_Category)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = TotalUnitsSold), hjust = -0.05, size = 3) +
  coord_flip() +
  facet_wrap(~ Gender_Category, scales = "free_y") +
  theme_minimal() +
  labs(title = "Total Units Sold by Product Line per Gender", x ="Product Line" , y ="Total Units" ) +
  scale_y_continuous(expand = expansion(mult = c(0,0.25)))

From above, Training is more popular amidst Kids, Men and even Women.

Step 11:Check Total Units Sold by Product Line Per Region

reg <- sales_details%>%
  group_by(Product_Line, Region) %>%
  summarise(TotalUnitsSold = sum(Units_Sold, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Product_Line'. You can override using the
## `.groups` argument.
reg
## # A tibble: 45 × 3
##    Product_Line Region    TotalUnitsSold
##    <fct>        <fct>              <int>
##  1 Basketball   Bangalore             74
##  2 Basketball   Bengaluru             77
##  3 Basketball   Delhi                181
##  4 Basketball   Hyd                   57
##  5 Basketball   Hyderabad             45
##  6 Basketball   Hyderbad              74
##  7 Basketball   Kolkata              180
##  8 Basketball   Mumbai               176
##  9 Basketball   Pune                 142
## 10 Lifestyle    Bangalore             84
## # ℹ 35 more rows
ggplot(reg, aes(x = reorder(Product_Line, TotalUnitsSold), y = TotalUnitsSold, fill =Region)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = TotalUnitsSold), hjust = -0.05, size = 3) +
  coord_flip() +
  facet_wrap(~Region, scales = "free_y") +
  theme_minimal() +
  labs(title = "Total Units Sold by Product Line per Region", x = "Product Line", y ="Total Units Sold") +
  scale_y_continuous(expand = expansion(mult = c(0,0.45)))

Training Product Line has the highest units sold in Bangalore, Bengaluru, Delhi,Mumbai and Pune.

Lifestyle Product Line has the highest units sold in Hyd and Hyderabad.

Basketball Product Line has the highest unit sold in Hyderbad and Kolkata.

Step12: Check Total Units Sold by Product Line Per Sales Channel

sal <- sales_details%>%
  group_by(Product_Line, Sales_Channel) %>%
  summarise(TotalUnitsSold = sum(Units_Sold, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Product_Line'. You can override using the
## `.groups` argument.
sal
## # A tibble: 10 × 3
##    Product_Line Sales_Channel TotalUnitsSold
##    <fct>        <fct>                  <int>
##  1 Basketball   Online                   488
##  2 Basketball   Retail                   518
##  3 Lifestyle    Online                   518
##  4 Lifestyle    Retail                   510
##  5 Running      Online                   473
##  6 Running      Retail                   446
##  7 Soccer       Online                   473
##  8 Soccer       Retail                   465
##  9 Training     Online                   572
## 10 Training     Retail                   516
table
## # A tibble: 15 × 3
##    Product_Line Gender_Category TotalUnitsSold
##    <fct>        <fct>                    <int>
##  1 Basketball   Kids                       338
##  2 Basketball   Men                        351
##  3 Basketball   Women                      317
##  4 Lifestyle    Kids                       352
##  5 Lifestyle    Men                        331
##  6 Lifestyle    Women                      345
##  7 Running      Kids                       319
##  8 Running      Men                        283
##  9 Running      Women                      317
## 10 Soccer       Kids                       310
## 11 Soccer       Men                        330
## 12 Soccer       Women                      298
## 13 Training     Kids                       355
## 14 Training     Men                        385
## 15 Training     Women                      348
ggplot(sal, aes(x = reorder(Product_Line, TotalUnitsSold), y = TotalUnitsSold, fill =Sales_Channel)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = TotalUnitsSold), hjust = -0.05, size = 4) +
  coord_flip() +
  facet_wrap(~Sales_Channel, scales = "free_y") +
  theme_minimal() +
  labs(title = "Total Units Sold by Product Line per Sales Channel", x = "Product Line", y = "Total Units Sold") +
  scale_y_continuous(expand = expansion(mult = c(0,0.45)))

Training has the highest orders online.

Basketball has the highest orders in Retail shops.

Step13: Check the Top Ten Products by Revenue

# Summarize top 10 products by revenue
top_products <- sales_details %>% 
  group_by(Product_Name) %>% 
  summarise(total_revenue = sum(Revenue, na.rm = TRUE)) %>% 
  arrange(desc(total_revenue)) %>% 
  slice_head(n = 10)
top_products
## # A tibble: 10 × 2
##    Product_Name     total_revenue
##    <fct>                    <int>
##  1 SuperRep Go             835423
##  2 LeBron 20               811545
##  3 Flex Trainer            782761
##  4 ZoomX Invincible        779115
##  5 Air Jordan              767045
##  6 Waffle One              742751
##  7 Blazer Mid              732817
##  8 Air Force 1             708072
##  9 Phantom GT              707336
## 10 Dunk Low                705320
# Horizontal bar chart without numeric x-axis
ggplot(top_products, aes(x = reorder(Product_Name, total_revenue), y = total_revenue)) +
  geom_col(fill = "lightgreen") +
  geom_text(aes(label = paste0("#", format(total_revenue, big.mark = ","))),
            hjust = -0.05, size = 4) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Products by Revenue",
    x = "Product Name",               # omit x-axis label
    y = "Total Revenue"      # y-axis label
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.25))) 

From above, SuperRep Go generated the highest revenue among all products.This indicates that it is the best performing product, contributing the largest share to total sales.

Step 14: Check the TOP 10 products by Units Sold

# Top 10 products by units sold
top_products_units <- sales_details %>%
  group_by(Product_Name) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
  arrange(desc(total_units)) %>%
  slice_head(n = 10)
top_products_units
## # A tibble: 10 × 2
##    Product_Name     total_units
##    <fct>                  <int>
##  1 SuperRep Go              308
##  2 Flex Trainer             282
##  3 Waffle One               279
##  4 LeBron 20                278
##  5 React Infinity           275
##  6 ZoomX Invincible         274
##  7 Blazer Mid               267
##  8 Air Jordan               265
##  9 Phantom GT               255
## 10 Zoom Freak               246
# Plot
ggplot(top_products_units, aes(x = reorder(Product_Name, total_units), y = total_units)) +
  geom_col(fill = "lightgreen") +
  geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Products by Units Sold",
    x = "Product Name",
    y = "Total Units Sold"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.25)))

From above, SuperRep Go Product was purchased in the largest Quantity.

Step 15: Check the TOP 3 Products by Units Sold per Gender Category

top_products_gender <- sales_details %>%
  group_by(Gender_Category, Product_Name) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
  arrange(Gender_Category, desc(total_units)) %>%
  slice_head(n = 3)
## `summarise()` has grouped output by 'Gender_Category'. You can override using
## the `.groups` argument.
top_products_gender
## # A tibble: 9 × 3
## # Groups:   Gender_Category [3]
##   Gender_Category Product_Name     total_units
##   <fct>           <fct>                  <int>
## 1 Kids            Blazer Mid               107
## 2 Kids            SuperRep Go              103
## 3 Kids            Waffle One                99
## 4 Men             Flex Trainer             111
## 5 Men             Air Jordan               109
## 6 Men             Phantom GT               103
## 7 Women           ZoomX Invincible         108
## 8 Women           React Infinity           103
## 9 Women           LeBron 20                102
# Plot with facets by gender
ggplot(top_products_gender, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Gender_Category)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
  coord_flip() +
  facet_wrap(~ Gender_Category, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Top 3 Products by Units Sold per Gender",
    x ="Product Name",
    y = "Total Units Sold"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.45)))

From above, Blazer Mid is most popular amidst Kids

Flex Trainer is most popular amidst Men

Zoomx Invincible is most popular amidst Women.

Step16:Check the TOP 3 Products by Units Sold Per Region

top_products_region <- sales_details %>%
  group_by(Region, Product_Name) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
  arrange(Region, desc(total_units)) %>%
  slice_head(n = 3)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
top_products_region
## # A tibble: 27 × 3
## # Groups:   Region [9]
##    Region    Product_Name   total_units
##    <fct>     <fct>                <int>
##  1 Bangalore Metcon 7                32
##  2 Bangalore Flex Trainer            29
##  3 Bangalore Blazer Mid              27
##  4 Bengaluru Metcon 7                44
##  5 Bengaluru Waffle One              33
##  6 Bengaluru React Infinity          32
##  7 Delhi     SuperRep Go             74
##  8 Delhi     Air Jordan              61
##  9 Delhi     Premier III             60
## 10 Hyd       Waffle One              22
## # ℹ 17 more rows
# Plot with facets by region
ggplot(top_products_region, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Region)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
  coord_flip() +
  facet_wrap(~ Region, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Top 3 Products by Units Sold per Region",
    x = "Product Name",
    y = "Total Units Sold"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.45)))

SuperRep Go is more popular in Delhi,Mumbai and Pune

Metcon 7 is more popular in Bangalore and Bengaluru

Waffle One is more popular in Hyd.

Dunk Low is more popular in Hyderabad

Air Jordan is more popular in Hyderbad

Lebron 20 is more popular in Kolkata

Step 17: Check Top 3 Products by UNITS SOLD PER SALES CHANNEL

top_products_channel <- sales_details %>%
  group_by(Sales_Channel, Product_Name) %>%
  summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
  arrange(Sales_Channel, desc(total_units)) %>%
  slice_head(n = 3)
## `summarise()` has grouped output by 'Sales_Channel'. You can override using the
## `.groups` argument.
# Plot with facets by sales channel
ggplot(top_products_channel, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Sales_Channel)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
  coord_flip() +
  facet_wrap(~ Sales_Channel, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Top 3 Products by Units Sold per Sales Channel",
    x = "Product Name",
    y = "Total Units Sold"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.45)))

ZoomX invincible has the highest order online with just a margin over SuperRep Go SuperRep Go has the highest order in retail stores leading with just four orders.

FINDINGS

CONCLUSION AND DISCUSSION