#Introduction
This analysis explores customer behavior and product performance using a retail dataset. The dataset was gotten from Kaggle at “https://www.kaggle.com/datasets/nalisha/shopping-behaviour-and-product-ranking-dateset”. The goal is to draw insight from the dataset on which age bracket makes the most purchases, the season with the highest sales volume, the location with highest sales volume and general customer behavioral pattern to help improve sales and customer rating.
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.0
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
library(ggplot2)
library(dplyr)
# Dataset identifier
dataset <- "nalisha/shopping-behaviour-and-product-ranking-dateset"
# Create data directory if it doesn't exist
if (!dir.exists("../data")) {
dir.create("../data")
}
# Download only if file does not already exist
if (!file.exists("../data/shopping_behavior_updated.csv")) {
system(
paste(
"kaggle datasets download",
dataset,
"-p ../data --unzip"
)
)
}
## [1] 0
list.files("../data")
## [1] "shopping_behavior_updated (1).csv"
customer_data <- read_csv("../data/shopping_behavior_updated (1).csv")
## Rows: 3900 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Gender, Item Purchased, Category, Location, Size, Color, Season, S...
## dbl (5): Customer ID, Age, Purchase Amount (USD), Review Rating, Previous P...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(customer_data)
## # A tibble: 6 x 16
## `Customer ID` Age Gender `Item Purchased` Category `Purchase Amount (USD)`
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 55 Male Blouse Clothing 53
## 2 2 19 Male Sweater Clothing 64
## 3 3 50 Male Jeans Clothing 73
## 4 4 21 Male Sandals Footwear 90
## 5 5 45 Male Blouse Clothing 49
## 6 6 46 Male Sneakers Footwear 20
## # i 10 more variables: Location <chr>, Size <chr>, Color <chr>, Season <chr>,
## # `Review Rating` <dbl>, `Subscription Status` <chr>,
## # `Discount Applied` <chr>, `Previous Purchases` <dbl>,
## # `Payment Method` <chr>, `Frequency of Purchases` <chr>
data <- clean_names(customer_data)
head(data)
## # A tibble: 6 x 16
## customer_id age gender item_purchased category purchase_amount_usd location
## <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 1 55 Male Blouse Clothing 53 Kentucky
## 2 2 19 Male Sweater Clothing 64 Maine
## 3 3 50 Male Jeans Clothing 73 Massachu~
## 4 4 21 Male Sandals Footwear 90 Rhode Is~
## 5 5 45 Male Blouse Clothing 49 Oregon
## 6 6 46 Male Sneakers Footwear 20 Wyoming
## # i 9 more variables: size <chr>, color <chr>, season <chr>,
## # review_rating <dbl>, subscription_status <chr>, discount_applied <chr>,
## # previous_purchases <dbl>, payment_method <chr>,
## # frequency_of_purchases <chr>
glimpse(data)
## Rows: 3,900
## Columns: 16
## $ customer_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ~
## $ age <dbl> 55, 19, 50, 21, 45, 46, 63, 27, 26, 57, 53, 30,~
## $ gender <chr> "Male", "Male", "Male", "Male", "Male", "Male",~
## $ item_purchased <chr> "Blouse", "Sweater", "Jeans", "Sandals", "Blous~
## $ category <chr> "Clothing", "Clothing", "Clothing", "Footwear",~
## $ purchase_amount_usd <dbl> 53, 64, 73, 90, 49, 20, 85, 34, 97, 31, 34, 68,~
## $ location <chr> "Kentucky", "Maine", "Massachusetts", "Rhode Is~
## $ size <chr> "L", "L", "S", "M", "M", "M", "M", "L", "L", "M~
## $ color <chr> "Gray", "Maroon", "Maroon", "Maroon", "Turquois~
## $ season <chr> "Winter", "Winter", "Spring", "Spring", "Spring~
## $ review_rating <dbl> 3.1, 3.1, 3.1, 3.5, 2.7, 2.9, 3.2, 3.2, 2.6, 4.~
## $ subscription_status <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"~
## $ discount_applied <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"~
## $ previous_purchases <dbl> 14, 2, 23, 49, 31, 14, 49, 19, 8, 4, 26, 10, 37~
## $ payment_method <chr> "Venmo", "Cash", "Credit Card", "PayPal", "PayP~
## $ frequency_of_purchases <chr> "Fortnightly", "Fortnightly", "Weekly", "Weekly~
str(data)
## spc_tbl_ [3,900 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customer_id : num [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
## $ gender : chr [1:3900] "Male" "Male" "Male" "Male" ...
## $ item_purchased : chr [1:3900] "Blouse" "Sweater" "Jeans" "Sandals" ...
## $ category : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ purchase_amount_usd : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
## $ location : chr [1:3900] "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
## $ size : chr [1:3900] "L" "L" "S" "M" ...
## $ color : chr [1:3900] "Gray" "Maroon" "Maroon" "Maroon" ...
## $ season : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
## $ review_rating : num [1:3900] 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
## $ subscription_status : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ discount_applied : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ previous_purchases : num [1:3900] 14 2 23 49 31 14 49 19 8 4 ...
## $ payment_method : chr [1:3900] "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ frequency_of_purchases: chr [1:3900] "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
## - attr(*, "spec")=
## .. cols(
## .. `Customer ID` = col_double(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. `Item Purchased` = col_character(),
## .. Category = col_character(),
## .. `Purchase Amount (USD)` = col_double(),
## .. Location = col_character(),
## .. Size = col_character(),
## .. Color = col_character(),
## .. Season = col_character(),
## .. `Review Rating` = col_double(),
## .. `Subscription Status` = col_character(),
## .. `Discount Applied` = col_character(),
## .. `Previous Purchases` = col_double(),
## .. `Payment Method` = col_character(),
## .. `Frequency of Purchases` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(data)
## customer_id age gender item_purchased
## Min. : 1.0 Min. :18.00 Length:3900 Length:3900
## 1st Qu.: 975.8 1st Qu.:31.00 Class :character Class :character
## Median :1950.5 Median :44.00 Mode :character Mode :character
## Mean :1950.5 Mean :44.07
## 3rd Qu.:2925.2 3rd Qu.:57.00
## Max. :3900.0 Max. :70.00
## category purchase_amount_usd location size
## Length:3900 Min. : 20.00 Length:3900 Length:3900
## Class :character 1st Qu.: 39.00 Class :character Class :character
## Mode :character Median : 60.00 Mode :character Mode :character
## Mean : 59.76
## 3rd Qu.: 81.00
## Max. :100.00
## color season review_rating subscription_status
## Length:3900 Length:3900 Min. :2.50 Length:3900
## Class :character Class :character 1st Qu.:3.10 Class :character
## Mode :character Mode :character Median :3.70 Mode :character
## Mean :3.75
## 3rd Qu.:4.40
## Max. :5.00
## discount_applied previous_purchases payment_method
## Length:3900 Min. : 1.00 Length:3900
## Class :character 1st Qu.:13.00 Class :character
## Mode :character Median :25.00 Mode :character
## Mean :25.35
## 3rd Qu.:38.00
## Max. :50.00
## frequency_of_purchases
## Length:3900
## Class :character
## Mode :character
##
##
##
sum(is.na(data))
## [1] 0
sum(duplicated(data))
## [1] 0
data <- data %>%
mutate(age_group = case_when(
age < 18 ~ "Under 18",
age >= 18 & age <= 25 ~ "18–25",
age >= 26 & age <= 35 ~ "26–35",
age >= 36 & age <= 45 ~ "36–45",
age >= 46 & age <= 60 ~ "46–60",
TRUE ~ "60+"
))
age_sales <- data %>%
group_by(age_group) %>%
summarise(
total_purchases = n(),
total_sales = sum(purchase_amount_usd, na.rm = TRUE)
) %>%
arrange(desc(total_purchases))
age_sales
## # A tibble: 5 x 3
## age_group total_purchases total_sales
## <chr> <int> <dbl>
## 1 46–60 1135 67711
## 2 26–35 742 44342
## 3 36–45 729 43234
## 4 60+ 723 43164
## 5 18–25 571 34630
age_sales %>%
ggplot(aes(x = reorder(age_group, -total_purchases), y = total_purchases)) +
geom_col(fill = "steelblue") +
labs(title = "Purchases by Age Group",
x = "Age Group",
y = "Number of Purchases")
season_sales <- data %>%
group_by(season) %>%
summarise(
total_purchases = n(),
total_sales = sum(purchase_amount_usd, na.rm = TRUE)
) %>%
arrange(desc(total_sales))
season_sales
## # A tibble: 4 x 3
## season total_purchases total_sales
## <chr> <int> <dbl>
## 1 Fall 975 60018
## 2 Spring 999 58679
## 3 Winter 971 58607
## 4 Summer 955 55777
season_sales %>%
ggplot(aes(x = reorder(season, -total_sales), y = total_sales)) +
geom_col(fill = "darkgreen") +
labs(title = "Total Sales by Season",
x = "Season",
y = "Total Sales (USD)")
location_sales <- data %>%
group_by(location) %>%
summarise(
total_purchases = n(),
total_sales = sum(purchase_amount_usd, na.rm = TRUE)
) %>%
arrange(desc(total_sales))
location_sales
## # A tibble: 50 x 3
## location total_purchases total_sales
## <chr> <int> <dbl>
## 1 Montana 96 5784
## 2 Illinois 92 5617
## 3 California 95 5605
## 4 Idaho 93 5587
## 5 Nevada 87 5514
## 6 Alabama 89 5261
## 7 New York 87 5257
## 8 North Dakota 83 5220
## 9 West Virginia 81 5174
## 10 Nebraska 87 5172
## # i 40 more rows
location_sales %>%
top_n(10, total_sales) %>%
ggplot(aes(x = reorder(location, total_sales), y = total_sales)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Top 10 Locations by Sales",
x = "Location",
y = "Total Sales (USD)")
top_category_per_season <- data %>%
group_by(season, item_purchased) %>%
summarise(
total_sales = sum(purchase_amount_usd, na.rm = TRUE),
.groups = "drop"
) %>%
group_by(season) %>%
slice_max(total_sales, n = 1)
top_category_per_season
## # A tibble: 4 x 3
## # Groups: season [4]
## season item_purchased total_sales
## <chr> <chr> <dbl>
## 1 Fall Hat 3224
## 2 Spring Sweater 3145
## 3 Summer Jewelry 3006
## 4 Winter Shirt 3102
top_category_per_season %>%
ggplot(aes(x = item_purchased, y = total_sales, fill = season)) +
geom_col(position = "dodge") +
labs(
title = "Clothing Categories by Season",
x = "Clothing purchased",
y = "Number of Purchases"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
subscription_analysis <- data %>%
group_by(subscription_status) %>%
summarise(
avg_purchase_amount = mean(purchase_amount_usd, na.rm = TRUE),
avg_previous_purchases = mean(previous_purchases, na.rm = TRUE),
avg_rating = mean(review_rating, na.rm = TRUE)
)
subscription_analysis
## # A tibble: 2 x 4
## subscription_status avg_purchase_amount avg_previous_purchases avg_rating
## <chr> <dbl> <dbl> <dbl>
## 1 No 59.9 25.1 3.75
## 2 Yes 59.5 26.1 3.74
discount_analysis <- data %>%
group_by(discount_applied) %>%
summarise(
avg_purchase_amount = mean(purchase_amount_usd, na.rm = TRUE),
total_purchases = n()
)
discount_analysis
## # A tibble: 2 x 3
## discount_applied avg_purchase_amount total_purchases
## <chr> <dbl> <int>
## 1 No 60.1 2223
## 2 Yes 59.3 1677
frequency_rating <- data %>%
group_by(frequency_of_purchases) %>%
summarise(
avg_rating = mean(review_rating, na.rm = TRUE),
avg_spend = mean(purchase_amount_usd, na.rm = TRUE)
)
frequency_rating
## # A tibble: 7 x 3
## frequency_of_purchases avg_rating avg_spend
## <chr> <dbl> <dbl>
## 1 Annually 3.76 60.2
## 2 Bi-Weekly 3.71 60.7
## 3 Every 3 Months 3.77 60.1
## 4 Fortnightly 3.76 59.1
## 5 Monthly 3.78 59.3
## 6 Quarterly 3.73 60.0
## 7 Weekly 3.76 59.0
The following measures will have positive impact on customer rating
Understanding customer demographics, seasonal trends, and behavioral patterns provides actionable insights to boost sales and enhance customer satisfaction. Data‑driven decisions will lead to sustainable growth and stronger customer relationships.