library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(tidyr)
library(scales )
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
Our data has been cleaned from the previous project.
TASK1: CONVERT CATEGORICAL VARIABLES TO FACTORS
sales_details<-read.csv("/Users/mac/Downloads/Nike sales cleaned data.csv")
glimpse(sales_details)
## Rows: 2,500
## Columns: 12
## $ Order_ID <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "XI", "M", "…
## $ Units_Sold <int> 2, 3, 4, 2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 2, 2, 2, 1, 4,…
## $ MRP <int> 6040, 4958, 6040, 9674, 6040, 7364, 6820, 6040, 6040,…
## $ Discount_Applied <dbl> 0.470, 0.615, 0.615, 0.615, 0.615, 0.615, 0.615, 0.32…
## $ Revenue <int> 6402, 5726, 9302, 7449, 4651, 2835, 5251, 4107, 2325,…
## $ Order_Date <chr> "2024-09-03", "2024-09-07", "2024-10-11", "2024-10-04…
## $ Sales_Channel <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region <chr> "Bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
colSums(is.na(sales_details))
## Order_ID Gender_Category Product_Line Product_Name
## 0 0 0 0
## Size Units_Sold MRP Discount_Applied
## 510 0 0 0
## Revenue Order_Date Sales_Channel Region
## 0 0 0 0
# convert categorical variables to factors
sales_details$Gender_Category <- as.factor(sales_details$Gender_Category)
sales_details$Product_Line <- as.factor(sales_details$Product_Line)
sales_details$Product_Name <- as.factor(sales_details$Product_Name)
sales_details$Size <- as.factor(sales_details$Size)
sales_details$Sales_Channel <- as.factor(sales_details$Sales_Channel)
sales_details$Region <- as.factor(sales_details$Region)
sales_details$Product_Line<-as.factor(sales_details$Product_Line)
EXPLORATORY DATA ANALYSIS
TASK2: Let’s find the summary statistics of Key numerical columns
sales_details %>%
summarise(
avg_units_sold = mean(Units_Sold, na.rm=TRUE),
avg_mrp = mean(MRP, na.rm=TRUE),
avg_discount = mean(Discount_Applied, na.rm=TRUE),
avg_revenue = mean(Revenue, na.rm=TRUE),
total_revenue = sum(Revenue, na.rm=TRUE),
total_units_sold = sum(Units_Sold, na.rm=TRUE)
)
## avg_units_sold avg_mrp avg_discount avg_revenue total_revenue
## 1 1.9916 6039.936 0.5425 5504.196 13760489
## total_units_sold
## 1 4979
ANALYZING THE HIGHEST ORDER IN VARIOUS SECTIONS USING HISTOGRAM
Orders by Units_Sold
ggplot(sales_details, aes(Units_Sold)) +
geom_histogram(bins = 30, fill="lightgreen", color="white") +
theme_minimal()
From the plot, most common order size is 2 units.
Orders by Gender Category
ggplot(sales_details, aes(Gender_Category)) +
geom_bar(fill="lightgreen") +
theme_minimal() +
labs(title="Orders by Gender Category")
From above, kids account for the highest number of orders.
Orders by Region
c<-ggplot(sales_details, aes(Region)) +
geom_bar(fill="lightgreen") +
theme_minimal() +
labs(title="Orders by Region")
ggsave("Delhi.png",plot=c)
## Saving 7 x 5 in image
From above, most orders originated from Delhi than any other region. This indicates that customers from Delhi are the most active contributing the largest share of total orders.
Orders by Channel
ggplot(sales_details, aes(Sales_Channel)) +
geom_bar(fill="lightgreen") +
theme_minimal() +
labs(title="Orders by Sales Channel")
From above, the online channel has the highest number of orders.This indicates that online channel is the most active channel in terms of orders.
TASK 3: TOP PRODUCT LINES
Let’s group Product Name by Product line
library(dplyr)
# Summarize units sold per Product Name within each Product Line
product_summary <- sales_details %>%
group_by(Product_Line, Product_Name) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE),
total_revenue = sum(Revenue, na.rm = TRUE)) %>%
arrange(Product_Line, desc(total_units)) # highest units first per line
## `summarise()` has grouped output by 'Product_Line'. You can override using the
## `.groups` argument.
product_summary
## # A tibble: 20 × 4
## # Groups: Product_Line [5]
## Product_Line Product_Name total_units total_revenue
## <fct> <fct> <int> <int>
## 1 Basketball LeBron 20 278 811545
## 2 Basketball Air Jordan 265 767045
## 3 Basketball Zoom Freak 246 703635
## 4 Basketball Kyrie Flytrap 217 540874
## 5 Lifestyle Waffle One 279 742751
## 6 Lifestyle Blazer Mid 267 732817
## 7 Lifestyle Dunk Low 243 705320
## 8 Lifestyle Air Force 1 239 708072
## 9 Running React Infinity 275 679475
## 10 Running Free RN 240 661652
## 11 Running Air Zoom 206 563047
## 12 Running Pegasus Turbo 198 572598
## 13 Soccer Phantom GT 255 707336
## 14 Soccer Premier III 242 605251
## 15 Soccer Tiempo Legend 224 634785
## 16 Soccer Mercurial Superfly 217 604607
## 17 Training SuperRep Go 308 835423
## 18 Training Flex Trainer 282 782761
## 19 Training ZoomX Invincible 274 779115
## 20 Training Metcon 7 224 622380
TOTAL UNITS SOLD BY PRODUCT LINE
productline_units <- sales_details %>%
group_by(Product_Line) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
arrange(desc(total_units))
ggplot(productline_units, aes(x = reorder(Product_Line, total_units), y = total_units)) +
geom_col(fill = "lightgreen") +
geom_text(aes(label = total_units), hjust = -0.05) +
coord_flip() +
theme_minimal() +
labs(title = "Total Units Sold by Product Line", x = "Product Line", y ="Total Units Sold") +
scale_y_continuous(expand = expansion(mult = c(0,0.25)))
From above, Training Product line is the highest units sold in Nike Sales.
TOTAL REVENUE BY PRODUCT LINE
productline_revenue <- sales_details %>%
group_by(Product_Line) %>%
summarise(total_revenue = sum(Revenue, na.rm = TRUE)) %>%
arrange(desc(total_revenue))
ggplot(productline_revenue, aes(x = reorder(Product_Line, total_revenue), y = total_revenue)) +
geom_col(fill = "lightgreen") +
geom_text(aes(label = paste0("#", format(total_revenue, big.mark=","))), hjust = -0.05) +
coord_flip() +
theme_minimal() +
labs(title = "Total Revenue by Product Line", x = "Product Line", y ="Total Revenue") +
scale_y_continuous(expand = expansion(mult = c(0,0.25)))
From above, Training product line generates the highest revenue for Nike Sales store.
UNITS SOLD BY PRODUCT LINE PER GENDER
ggplot(sales_details, aes(x = reorder(Product_Line, Units_Sold), y = Units_Sold, fill = Gender_Category)) +
geom_col(show.legend = FALSE) +
#geom_text(aes(label = Units_Sold), hjust = -0.05, size = 3) +
coord_flip() +
facet_wrap(~ Gender_Category, scales = "free_y") +
theme_minimal() +
labs(title = "Units Sold by Product Line per Gender", x ="Product Line" , y ="Gender" ) +
scale_y_continuous(expand = expansion(mult = c(0,0.25)))
Lifestyle and Training are the common product line amidst Kids Training
is the most common product line amidst Men Lifestyle and Training are
the most common product line amidst women.
UNITS SOLD BY PRODUCT LINE PER REGION
ggplot(sales_details, aes(x = reorder(Product_Line, Units_Sold), y = Units_Sold, fill =Region)) +
geom_col(show.legend = FALSE) +
#geom_text(aes(label = Units_Sold), hjust = 0.2, size = 2) +
coord_flip() +
facet_wrap(~Region, scales = "free_y") +
theme_minimal() +
labs(title = "Units Sold by Product Line per Region", x = "Product Line", y ="Region") +
scale_y_continuous(expand = expansion(mult = c(0,0.45)))
Training Product Line has the highest unit sold in Bangalore, Bengaluru, Delhi,Mumbai and Pune Lifestyle Product Line has the highest unit sold in Hyd and Hyderabad Basketball Product Line has the highest unit sold in Hyderbad and Kolkata
UNITS SOLD BY PRODUCT LINE PER SALES CHANNEL
ggplot(sales_details, aes(x = reorder(Product_Line, Units_Sold), y = Units_Sold, fill =Sales_Channel)) +
geom_col(show.legend = FALSE) +
#geom_text(aes(label = Units_Sold), hjust = -0.05, size = 4) +
coord_flip() +
facet_wrap(~Sales_Channel, scales = "free_y") +
theme_minimal() +
labs(title = "Units Sold by Product Line per Sales Channel", x = "Product Line", y = "Sales Channel") +
scale_y_continuous(expand = expansion(mult = c(0,0.45)))
Training has the highest order online Basketball has the highest order in Retail shops.
Insights: Training and Lifestyle Product Line is the most popular Product Line
TASK4: TOP PRODUCTS NAMES
GET THE TOP TEN PRODUCTS BY REVENUE
# Summarize top 10 products by revenue
top_products <- sales_details %>%
group_by(Product_Name) %>%
summarise(total_revenue = sum(Revenue, na.rm = TRUE)) %>%
arrange(desc(total_revenue)) %>%
slice_head(n = 10)
top_products
## # A tibble: 10 × 2
## Product_Name total_revenue
## <fct> <int>
## 1 SuperRep Go 835423
## 2 LeBron 20 811545
## 3 Flex Trainer 782761
## 4 ZoomX Invincible 779115
## 5 Air Jordan 767045
## 6 Waffle One 742751
## 7 Blazer Mid 732817
## 8 Air Force 1 708072
## 9 Phantom GT 707336
## 10 Dunk Low 705320
# Horizontal bar chart without numeric x-axis
ggplot(top_products, aes(x = reorder(Product_Name, total_revenue), y = total_revenue)) +
geom_col(fill = "lightgreen") +
geom_text(aes(label = paste0("#", format(total_revenue, big.mark = ","))),
hjust = -0.05, size = 4) +
coord_flip() +
theme_minimal() +
labs(
title = "Top 10 Products by Revenue",
x = "Product Name", # omit x-axis label
y = "Total Revenue" # y-axis label
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25)))
From above, SuperRep Go generated the highest revenue among all products. This indicates that it is the best performing product, contributing the largest share to total sales.
Recommendation:
Nike Company should consider promoting this product(SuperRep Go) futher.
TOP 10 products by Unit Sold
# Top 10 products by units sold
top_products_units <- sales_details %>%
group_by(Product_Name) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
arrange(desc(total_units)) %>%
slice_head(n = 10)
# Plot
ggplot(top_products_units, aes(x = reorder(Product_Name, total_units), y = total_units)) +
geom_col(fill = "lightgreen") +
geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
coord_flip() +
theme_minimal() +
labs(
title = "Top 10 Products by Units Sold",
x = "Product Name",
y = "Total Units Sold"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25)))
From above, SuperRep Go Product was purchased in the largest Quantity.
TOP 3 PRODUCTS BY UNITS SOLD PER GENDER CATEGORY
top_products_gender <- sales_details %>%
group_by(Gender_Category, Product_Name) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
arrange(Gender_Category, desc(total_units)) %>%
slice_head(n = 3)
## `summarise()` has grouped output by 'Gender_Category'. You can override using
## the `.groups` argument.
# Plot with facets by gender
ggplot(top_products_gender, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Gender_Category)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
coord_flip() +
facet_wrap(~ Gender_Category, scales = "free_y") +
theme_minimal() +
labs(
title = "Top 3 Products by Units Sold per Gender",
x ="Product Name",
y = "Total Units Sold"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.45)))
From above, Blazer Mid is most popular amidst Kids
Flex Trainer is most popular amidst Men
Zoomx Invincible is most popular amidst Women
Recommendation
From the order by gender analysis, Kids have the highest number of orders, I will advise Nike Sales to get more Blazer Mid in their storeto increase orders in the store.
TOP 3 PRODUCTS BY UNITS SOLD PER REGION
top_products_region <- sales_details %>%
group_by(Region, Product_Name) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
arrange(Region, desc(total_units)) %>%
slice_head(n = 3)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
# Plot with facets by region
ggplot(top_products_region, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Region)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
coord_flip() +
facet_wrap(~ Region, scales = "free_y") +
theme_minimal() +
labs(
title = "Top 3 Products by Units Sold per Region",
x = "Product Name",
y = "Total Units Sold"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.45)))
SuperRep Go is popular in Delhi,Mumbai and Pune
Metcon 7 is popular in Bangalore and Bengaluru
Waffle One is popular in Hyd.
Dunk Low is popular in Hyderabad
Air Jordan is popular in Hyderbad
Lebron 20 is popular in Kolkata
TOP 3 PRODUCTS BY UNITS SOLD PER SALES CHANNEL
top_products_channel <- sales_details %>%
group_by(Sales_Channel, Product_Name) %>%
summarise(total_units = sum(Units_Sold, na.rm = TRUE)) %>%
arrange(Sales_Channel, desc(total_units)) %>%
slice_head(n = 3)
## `summarise()` has grouped output by 'Sales_Channel'. You can override using the
## `.groups` argument.
# Plot with facets by sales channel
ggplot(top_products_channel, aes(x = reorder(Product_Name, total_units), y = total_units, fill = Sales_Channel)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = total_units), hjust = -0.05, size = 4) +
coord_flip() +
facet_wrap(~ Sales_Channel, scales = "free_y") +
theme_minimal() +
labs(
title = "Top 3 Products by Units Sold per Sales Channel",
x = "Product Name",
y = "Total Units Sold"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.45)))
ZoomX invincible has the highest order online while SuperRep Go has the highest order in retail stores.
Recommendation: Since Kids order more, Blazer Mild should be stocked in stores and made available online
SuperRep Go is very popular across all regions, channels, and genders. Nike should feature it as a flagship product in stores and promotions.
Flex Trainer should be stocked as it is popular amidst Men ZoomX Invincible should also be stocked as it is popular amidst Women.
Most popular product falls under Training and Lifestyle
Products Recommended for Nike’s 2026 Inventory Strategy
Training: SuperRep Go, Metcon 7, ZoomX Invincible Lifestyle: Blazer Mid and Waffle One Basketball: Air Jordan and Lebron 20
Also,since online channel has the highest number of orders, Nike should employ more people in the retail stores to enhance more sales.
SuperRep Go is highly recommended as it generates the highest revenue and is also most common in Delhi region where most of the orders generated from.
Kids has the highest order; stock the stores more with Blazer Mid