What is Amazon trying to solve?

Amazon wants to boost revenue and reduce subscription cancellations.

How?

Use sales and customer data to uncover key patterns and recommend actions * Understand what drives product revenue * Identify churn trends in subscriptions * Segment and analyze customer behavior * Provide data-driven recommendations

library(tidyverse)
install.packages("dplyr")
library(dplyr)
sales_data<- read_csv("sales_data.csv")
customer_data<- read_csv("customer_data.csv")

Revenue by product category

# Calculate total revenue by product category
revenue_by_category <- sales_data %>%
  group_by(product_category) %>%
  summarise(total_revenue = sum(total_price, na.rm = TRUE)) %>%
  arrange(desc(total_revenue))
blue_palette <- c(
  "Books" = "#7fb3d5",      
  "Wellness" = "#D5DEEF",
  "Electronics" = "#B1C9EF",
  "Clothing" = "#8AAEF0",
  "Toys" = "#628ECB",
  "Home" = "#395886",        
  "Grocery" = "#aed6f1"     
)

ggplot(revenue_by_category, 
       aes(x = reorder(product_category, total_revenue), 
           y = total_revenue, fill = product_category)) +
  geom_col() +
  geom_text(aes(label = round(total_revenue, 0)),  # You can use comma_format() too
            hjust = 1.1, color = "white", size = 4, fontface = "bold") +
  coord_flip() +
  scale_fill_manual(values = blue_palette)+
  labs(title = "Revenue by Product Category",
       x = "Product Category",
       y = "Total Revenue") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.title = element_text(face = "bold"),
    legend.position = "none"
  )

Who are the best customers (based on CLTV)?

cltv <- sales_data %>%
  group_by(customer_id) %>%
  summarise(total_spent = sum(total_price)) %>%
  arrange(desc(total_spent))

head(cltv, 10)
## # A tibble: 10 × 2
##    customer_id total_spent
##    <chr>             <dbl>
##  1 CUST0002          3955.
##  2 CUST0133          3653.
##  3 CUST0007          3558.
##  4 CUST0109          3495.
##  5 CUST0038          3444.
##  6 CUST0143          3304.
##  7 CUST0121          3087.
##  8 CUST0024          3082.
##  9 CUST0111          3063.
## 10 CUST0105          3056.
ggplot(cltv, aes(x = total_spent)) +
  geom_histogram(fill = "#638ECB", color = "white", bins = 10) +
  labs(title = "Distribution of Customer Lifetime Value",
       x = "Total Spent", y = "Number of Customers") +
  theme_minimal()

library(ggplot2)

buyer_counts <- sales_data %>%
  group_by(customer_id) %>%
  summarise(purchase_count = n()) %>%
  mutate(buyer_type = ifelse(purchase_count == 1, "One-Time", "Repeat")) %>%
  count(buyer_type)

ggplot(buyer_counts, aes(x = "", y = n, fill = buyer_type)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(round(n/sum(n)*100), "%")), 
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  labs(title = "Customer Types: One-Time vs Repeat Buyers",
       fill = "Buyer Type") +
  theme_void() +
  scale_fill_manual(values = c("One-Time" = "#B1C9EF", "Repeat" = "#638ECB"))  # blue palette

Customer Purchase Behavior

library(dplyr)
library(ggplot2)

# Categorize customers into tiers
frequency_tiers <- sales_data %>%
  group_by(customer_id) %>%
  summarise(purchase_count = n()) %>%
  mutate(tier = case_when(
    purchase_count == 1 ~ "One-Time",
    purchase_count <= 3 ~ "Low Frequency (2–3)",
    purchase_count <= 5 ~ "Mid Frequency (4–5)",
    purchase_count >= 6 ~ "High Frequency (6+)"
  )) %>%
  count(tier)

# Plot pie chart
ggplot(frequency_tiers, aes(x = "", y = n, fill = tier)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(round(n/sum(n)*100), "%")),
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  labs(title = "Customer Purchase Frequency Segments",
       fill = "Purchase Tier") +
  theme_void() +
  scale_fill_manual(values = c(
    "One-Time" = "#B1C9EF",           # Light blue
    "Low Frequency (2–3)" = "#aed6f1",# Deep blue
    "Mid Frequency (4–5)" = "#a9cce3",# Lemon green
    "High Frequency (6+)" = "#3498db" # Sunset orange
  ))

📉 Where is churn highest? Note: This plot includes only customers who have churned

customer_data %>%
  filter(!is.na(subscription_end)) %>%
  mutate(churn_duration = as.numeric(subscription_end - subscription_start)) %>%
  ggplot(aes(churn_duration)) +
  geom_histogram(binwidth = 10, fill = "#aed6f1", color = "black") +
  labs(title = "Distribution of Churn Duration (in Days)",
       x = "Days until Churn", y = "Number of Customers")

`

Impact of Product Ratings on Revenue

library(dplyr)

review_sales <- sales_data %>%
  mutate(rating_group = case_when(
    product_rating >= 4.5 ~ "High Rated (4.5+)",
    product_rating >= 3.5 ~ "Mid Rated (3.5–4.4)",
    TRUE ~ "Low Rated (<3.5)"
  )) %>%
  group_by(rating_group) %>%
  summarise(
    total_sales = sum(total_price, na.rm = TRUE),
    avg_price = mean(unit_price, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(total_sales))
library(ggplot2)

ggplot(review_sales, aes(x = rating_group, y = total_sales, fill = rating_group)) +
  geom_col(width = 0.5) +
  geom_text(aes(label = paste0("₹", round(total_sales, 0))),
            vjust = -0.5, size = 3, fontface = "bold") +
  scale_y_continuous(labels = scales::comma_format()) +
  scale_fill_manual(values = c(
    "High Rated (4.5+)" = "#b3d7ee",     # Lemon green
    "Mid Rated (3.5–4.4)" = "#dcecf5",  # Blue
    "Low Rated (<3.5)" = "#94d2fa"      # Sunset orange
  )) +
  labs(title = "Impact of Product Reviews on Total Sales",
       x = "Rating Group",
       y = "Total Sales") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    axis.title = element_text(face = "bold"),
    legend.position = "none"
  )

# Churn rate by subscription tier

library(ggplot2)
library(dplyr)
library(tidyr)

# Prepare the data
churn_data <- customer_data %>%
  mutate(churned = ifelse(!is.na(subscription_end), "Yes", "No")) %>%
  group_by(subscription_tier, churned) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = churned, values_from = count, values_fill = 0) %>%
  mutate(total = Yes + No,
         churn_rate = Yes / total)

# Create donut plot
ggplot(churn_data, aes(x = 2, y = churn_rate, fill = subscription_tier)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  geom_text(aes(label = paste0(subscription_tier, "\n", round(churn_rate*100, 1), "%")),
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  theme_void() +
  labs(title = "Churn Rate by Subscription Tier",
       fill = "Subscription Tier") +
  theme(legend.position = "none")+
  scale_fill_manual(values = c(
    "Basic" = "#85c8f3",    
    "Standard" = "#467a9c",  
    "Premium" = "#b3d7ee"    
  ))

# Product price by Rating group

library(tidyr)
library(ggplot2)

sales_data %>%
  mutate(rating_group = case_when(
    product_rating >= 4.5 ~ "High Rated (4.5+)",
    product_rating >= 3.5 ~ "Mid Rated (3.5–4.4)",
    TRUE ~ "Low Rated (<3.5)"
  )) %>%
  group_by(rating_group) %>%
  summarise(
    Avg_Price = mean(unit_price, na.rm = TRUE),
    Median_Price = median(unit_price, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = c(Avg_Price, Median_Price), names_to = "Metric", values_to = "Price") %>%
  ggplot(aes(x = rating_group, y = Price, fill = Metric)) +
  geom_col(position = "dodge") +
  labs(title = "Average vs Median Price by Rating Group",
       x = "Rating Group", y = "Price (₹)", fill = "Metric") +
  scale_fill_manual(values = c("Avg_Price" = "#4798cc", "Median_Price" = "#a8dbfb")) +
  theme_minimal()

Insights

These insights will guide Amazon to tweak pricing strategy, improve subscription tiers, and enhance product targeting.