Introduction

This report provides an analysis of customer demographics, purchasing behavior, and segmentation.

# Load Data
# Read the shopping-trends CSV from the working directory into a tibble.
shopping_trends <- read_csv(file = "shopping_trends.csv")
## Rows: 3900 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): Gender, Item Purchased, Category, Location, Size, Color, Season, S...
## dbl  (5): Customer ID, Age, Purchase Amount (USD), Review Rating, Previous P...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect Data
# Print a per-column summary (type, missing count, completeness and basic
# statistics) of the loaded data, without the inline histogram column.
skim_without_charts(shopping_trends)
Data summary
Name shopping_trends
Number of rows 3900
Number of columns 19
_______________________
Column type frequency:
character 14
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Gender 0 1 4 6 0 2 0
Item Purchased 0 1 3 10 0 25 0
Category 0 1 8 11 0 4 0
Location 0 1 4 14 0 50 0
Size 0 1 1 2 0 4 0
Color 0 1 3 9 0 25 0
Season 0 1 4 6 0 4 0
Subscription Status 0 1 2 3 0 2 0
Payment Method 0 1 4 13 0 6 0
Shipping Type 0 1 7 14 0 6 0
Discount Applied 0 1 2 3 0 2 0
Promo Code Used 0 1 2 3 0 2 0
Preferred Payment Method 0 1 4 13 0 6 0
Frequency of Purchases 0 1 6 14 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Customer ID 0 1 1950.50 1125.98 1.0 975.75 1950.5 2925.25 3900
Age 0 1 44.07 15.21 18.0 31.00 44.0 57.00 70
Purchase Amount (USD) 0 1 59.76 23.69 20.0 39.00 60.0 81.00 100
Review Rating 0 1 3.75 0.72 2.5 3.10 3.7 4.40 5
Previous Purchases 0 1 25.35 14.45 1.0 13.00 25.0 38.00 50
#No null values.
# Distribution of age
# Histogram of customer ages using 5-year-wide bins.
ggplot(data = shopping_trends) +
  geom_histogram(
    mapping = aes(x = Age),
    binwidth = 5,
    fill = "blue",
    color = "black"
  ) +
  ggtitle("Distribution of Age") +
  xlab("Age") +
  ylab("Frequency")

#People in their mid 30s and late 50s shop here the most.
# Sales by category
# Total purchase amount (USD) for each product category.
sales_by_category <- shopping_trends %>%
  group_by(Category) %>%
  summarise(Total_Sales = sum(`Purchase Amount (USD)`))

# Aesthetic names are case-sensitive: ggplot2 does not recognise `Fill`, so
# the bars were previously drawn without per-category colour. Use `fill`.
ggplot(
  sales_by_category,
  aes(x = Category, y = Total_Sales, fill = Category)
) +
  geom_col() +
  labs(title = "Total Sales by Category", x = "Category", y = "Total Sales")

# Popular items
# Total purchase amount (USD) per item, sorted from highest to lowest.
popular_items <- shopping_trends %>%
  group_by(`Item Purchased`) %>%
  summarise(`Total Sales` = sum(`Purchase Amount (USD)`)) %>%
  arrange(desc(`Total Sales`))

# Bars are ordered by descending total; x labels are tilted so the item
# names do not overlap.
ggplot(
  data = popular_items,
  mapping = aes(
    x = reorder(`Item Purchased`, -`Total Sales`),
    y = `Total Sales`,
    fill = `Item Purchased`
  )
) +
  geom_col() +
  labs(
    title = "Most Purchased Items",
    x = "Item Purchased",
    y = "Total Sales"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# The most popular items in the store are clothing items, although jeans rank last in sales.
# Customer demographics
# Bar chart of the number of rows (customers) recorded for each gender.
ggplot(data = shopping_trends, mapping = aes(x = Gender, fill = Gender)) +
  geom_bar(stat = "count") +
  labs(
    title = "Customer Gender Distribution",
    x = "Gender",
    y = "Count"
  )

# Men shop more in this store. Could we find a way to market more to women?
# Perform target age and gender demographic analysis
# Count customers per gender and age within the 18-35 target range
# (adjust the bounds as needed).
target_demographics <- shopping_trends %>%
  filter(between(Age, 18, 35)) %>%
  group_by(Gender, Age) %>%
  # Drop the grouping explicitly: by default the result stays grouped by
  # Gender (and summarise() prints a message about it), which can silently
  # affect any later verbs applied to this table.
  summarise(Count = n(), .groups = "drop")
# Plot target age and gender demographics
# One bar per age and gender, placed side by side so genders can be compared.
ggplot(
  data = target_demographics,
  mapping = aes(x = Age, y = Count, fill = Gender)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Target Age and Gender Demographics",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal()

# Men around age 25 shop here the most, followed by men around age 33. We may want to cater more to women to increase sales from them.
# Seasonal trends
# Total purchase amount (USD) per season, computed from the `Season` recorded
# for each purchase. The previous version plotted twelve `runif()` draws that
# were unrelated to `shopping_trends` and changed on every run. The data set
# has no month or date column (see the skim summary), so season is the finest
# time granularity available.
seasonal_trends <- shopping_trends %>%
  group_by(Season) %>%
  summarise(Total_Sales = sum(`Purchase Amount (USD)`))

ggplot(seasonal_trends, aes(x = Season, y = Total_Sales, fill = Season)) +
  geom_col() +
  labs(title = "Sales Trends by Season", x = "Season", y = "Total Sales")

# NOTE(review): The earlier month-by-month observations (peaks around March,
# May and August; dips around June, July and October) were read off randomly
# generated values and are not supported by the data. Re-derive the seasonal
# commentary from this chart of actual totals.
# Payment Methods
# Total purchase amount (USD) for each value of the `Payment Method` column.
payment_method <- shopping_trends %>%
  group_by(`Payment Method`) %>%
  summarise(`Total Sales` = sum(`Purchase Amount (USD)`))

# The title names what is actually plotted: sales totals for the
# `Payment Method` column, not the separate `Preferred Payment Method` column.
ggplot(
  payment_method,
  aes(x = `Payment Method`, y = `Total Sales`, fill = `Payment Method`)
) +
  geom_col() +
  labs(
    title = "Total Sales by Payment Method",
    x = "Payment Method",
    y = "Total Sales"
  )

#Credit card is the most popular form of payment followed by Venmo.
# Subscription Impact
# Total purchase amount (USD) split by subscription status.
subscription_impact <- shopping_trends %>%
  group_by(`Subscription Status`) %>%
  summarise(`Total Sales` = sum(`Purchase Amount (USD)`))

ggplot(
  data = subscription_impact,
  mapping = aes(
    x = `Subscription Status`,
    y = `Total Sales`,
    fill = `Subscription Status`
  )
) +
  geom_col() +
  labs(
    title = "Impact of Subscription on Sales",
    x = "Subscription Status",
    y = "Total Sales"
  )

# Non-subscribers might be making bigger purchases or buying on impulse. Subscribers might be spreading their purchases over time to take advantage of the benefits. We need to think of better promotions to increase the number of subscribers.
# Average purchase amount for subscribers and non-subscribers
# Mean purchase amount (USD) per subscription status.
average_purchase <- shopping_trends %>%
  group_by(`Subscription Status`) %>%
  summarise(`Average Purchase Amount` = mean(`Purchase Amount (USD)`))

ggplot(
  data = average_purchase,
  mapping = aes(
    x = `Subscription Status`,
    y = `Average Purchase Amount`,
    fill = `Subscription Status`
  )
) +
  geom_col() +
  labs(
    title = "Average Purchase Amount by Subscription Status",
    x = "Subscription Status",
    y = "Average Purchase Amount (USD)"
  )

# Non-subscribers spend slightly more on average than subscribers.
# Discount and promo code use
# Total purchase amount (USD) split by the `Discount Applied` column.
discount_use <- shopping_trends %>%
  group_by(`Discount Applied`) %>%
  summarise(`Total Sales` = sum(`Purchase Amount (USD)`))

ggplot(
  data = discount_use,
  mapping = aes(
    x = `Discount Applied`,
    y = `Total Sales`,
    fill = `Discount Applied`
  )
) +
  geom_col() +
  labs(
    title = "Impact of Discounts on Sales",
    x = "Discount Applied",
    y = "Total Sales"
  )

# The y-axis shows total sales, ranging from 0 to above 100,000. People aren't taking advantage of promos or discounts.
# Perform customer segmentation with different variables
# Average each customer's previous purchases and review rating, then label
# the customer with the first rule that matches (rules are checked in order;
# anything that matches neither threshold pair is "Low Value").
customer_segments <- shopping_trends %>%
  group_by(`Customer ID`) %>%
  summarise(
    `Previous Purchases` = mean(`Previous Purchases`),
    `Review Rating` = mean(`Review Rating`)
  ) %>%
  mutate(
    Segment = case_when(
      `Review Rating` > 4 & `Previous Purchases` > 30 ~ "High Value",
      `Review Rating` > 3 & `Previous Purchases` > 15 ~ "Medium Value",
      TRUE ~ "Low Value"
    )
  )
# Plot customer segments
# Bar chart of how many customers fall into each segment.
ggplot(data = customer_segments, mapping = aes(x = Segment, fill = Segment)) +
  geom_bar(stat = "count") +
  labs(
    title = "Customer Segmentation",
    x = "Segment",
    y = "Count"
  ) +
  theme_minimal()

Summary: