Amazon wants to boost revenue and reduce subscription cancellations.
Use sales and customer data to uncover key patterns and recommend actions * Understand what drives product revenue * Identify churn trends in subscriptions * Segment and analyze customer behavior * Provide data-driven recommendations
library(tidyverse)
install.packages("dplyr")
library(dplyr)
sales_data<- read_csv("sales_data.csv")
customer_data<- read_csv("customer_data.csv")
# Calculate total revenue by product category
revenue_by_category <- sales_data %>%
group_by(product_category) %>%
summarise(total_revenue = sum(total_price, na.rm = TRUE)) %>%
arrange(desc(total_revenue))
blue_palette <- c(
"Books" = "#7fb3d5",
"Wellness" = "#D5DEEF",
"Electronics" = "#B1C9EF",
"Clothing" = "#8AAEF0",
"Toys" = "#628ECB",
"Home" = "#395886",
"Grocery" = "#aed6f1"
)
ggplot(revenue_by_category,
aes(x = reorder(product_category, total_revenue),
y = total_revenue, fill = product_category)) +
geom_col() +
geom_text(aes(label = round(total_revenue, 0)), # You can use comma_format() too
hjust = 1.1, color = "white", size = 4, fontface = "bold") +
coord_flip() +
scale_fill_manual(values = blue_palette)+
labs(title = "Revenue by Product Category",
x = "Product Category",
y = "Total Revenue") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.title = element_text(face = "bold"),
legend.position = "none"
)
cltv <- sales_data %>%
group_by(customer_id) %>%
summarise(total_spent = sum(total_price)) %>%
arrange(desc(total_spent))
head(cltv, 10)
## # A tibble: 10 × 2
## customer_id total_spent
## <chr> <dbl>
## 1 CUST0002 3955.
## 2 CUST0133 3653.
## 3 CUST0007 3558.
## 4 CUST0109 3495.
## 5 CUST0038 3444.
## 6 CUST0143 3304.
## 7 CUST0121 3087.
## 8 CUST0024 3082.
## 9 CUST0111 3063.
## 10 CUST0105 3056.
ggplot(cltv, aes(x = total_spent)) +
geom_histogram(fill = "#638ECB", color = "white", bins = 10) +
labs(title = "Distribution of Customer Lifetime Value",
x = "Total Spent", y = "Number of Customers") +
theme_minimal()
library(ggplot2)
buyer_counts <- sales_data %>%
group_by(customer_id) %>%
summarise(purchase_count = n()) %>%
mutate(buyer_type = ifelse(purchase_count == 1, "One-Time", "Repeat")) %>%
count(buyer_type)
ggplot(buyer_counts, aes(x = "", y = n, fill = buyer_type)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(round(n/sum(n)*100), "%")),
position = position_stack(vjust = 0.5), color = "white", size = 5) +
labs(title = "Customer Types: One-Time vs Repeat Buyers",
fill = "Buyer Type") +
theme_void() +
scale_fill_manual(values = c("One-Time" = "#B1C9EF", "Repeat" = "#638ECB")) # blue palette
library(dplyr)
library(ggplot2)
# Categorize customers into tiers
frequency_tiers <- sales_data %>%
group_by(customer_id) %>%
summarise(purchase_count = n()) %>%
mutate(tier = case_when(
purchase_count == 1 ~ "One-Time",
purchase_count <= 3 ~ "Low Frequency (2–3)",
purchase_count <= 5 ~ "Mid Frequency (4–5)",
purchase_count >= 6 ~ "High Frequency (6+)"
)) %>%
count(tier)
# Plot pie chart
ggplot(frequency_tiers, aes(x = "", y = n, fill = tier)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(round(n/sum(n)*100), "%")),
position = position_stack(vjust = 0.5), color = "white", size = 5) +
labs(title = "Customer Purchase Frequency Segments",
fill = "Purchase Tier") +
theme_void() +
scale_fill_manual(values = c(
"One-Time" = "#B1C9EF", # Light blue
"Low Frequency (2–3)" = "#aed6f1",# Deep blue
"Mid Frequency (4–5)" = "#a9cce3",# Lemon green
"High Frequency (6+)" = "#3498db" # Sunset orange
))
📉 Where is churn highest? Note: This plot includes only customers who have churned
customer_data %>%
filter(!is.na(subscription_end)) %>%
mutate(churn_duration = as.numeric(subscription_end - subscription_start)) %>%
ggplot(aes(churn_duration)) +
geom_histogram(binwidth = 10, fill = "#aed6f1", color = "black") +
labs(title = "Distribution of Churn Duration (in Days)",
x = "Days until Churn", y = "Number of Customers")
`
library(dplyr)
review_sales <- sales_data %>%
mutate(rating_group = case_when(
product_rating >= 4.5 ~ "High Rated (4.5+)",
product_rating >= 3.5 ~ "Mid Rated (3.5–4.4)",
TRUE ~ "Low Rated (<3.5)"
)) %>%
group_by(rating_group) %>%
summarise(
total_sales = sum(total_price, na.rm = TRUE),
avg_price = mean(unit_price, na.rm = TRUE),
count = n()
) %>%
arrange(desc(total_sales))
library(ggplot2)
ggplot(review_sales, aes(x = rating_group, y = total_sales, fill = rating_group)) +
geom_col(width = 0.5) +
geom_text(aes(label = paste0("₹", round(total_sales, 0))),
vjust = -0.5, size = 3, fontface = "bold") +
scale_y_continuous(labels = scales::comma_format()) +
scale_fill_manual(values = c(
"High Rated (4.5+)" = "#b3d7ee", # Lemon green
"Mid Rated (3.5–4.4)" = "#dcecf5", # Blue
"Low Rated (<3.5)" = "#94d2fa" # Sunset orange
)) +
labs(title = "Impact of Product Reviews on Total Sales",
x = "Rating Group",
y = "Total Sales") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 15),
axis.title = element_text(face = "bold"),
legend.position = "none"
)
# Churn rate by subscription tier
library(ggplot2)
library(dplyr)
library(tidyr)
# Prepare the data
churn_data <- customer_data %>%
mutate(churned = ifelse(!is.na(subscription_end), "Yes", "No")) %>%
group_by(subscription_tier, churned) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(names_from = churned, values_from = count, values_fill = 0) %>%
mutate(total = Yes + No,
churn_rate = Yes / total)
# Create donut plot
ggplot(churn_data, aes(x = 2, y = churn_rate, fill = subscription_tier)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
xlim(0.5, 2.5) +
geom_text(aes(label = paste0(subscription_tier, "\n", round(churn_rate*100, 1), "%")),
position = position_stack(vjust = 0.5), color = "white", size = 5) +
theme_void() +
labs(title = "Churn Rate by Subscription Tier",
fill = "Subscription Tier") +
theme(legend.position = "none")+
scale_fill_manual(values = c(
"Basic" = "#85c8f3",
"Standard" = "#467a9c",
"Premium" = "#b3d7ee"
))
# Product price by Rating group
library(tidyr)
library(ggplot2)
sales_data %>%
mutate(rating_group = case_when(
product_rating >= 4.5 ~ "High Rated (4.5+)",
product_rating >= 3.5 ~ "Mid Rated (3.5–4.4)",
TRUE ~ "Low Rated (<3.5)"
)) %>%
group_by(rating_group) %>%
summarise(
Avg_Price = mean(unit_price, na.rm = TRUE),
Median_Price = median(unit_price, na.rm = TRUE)
) %>%
pivot_longer(cols = c(Avg_Price, Median_Price), names_to = "Metric", values_to = "Price") %>%
ggplot(aes(x = rating_group, y = Price, fill = Metric)) +
geom_col(position = "dodge") +
labs(title = "Average vs Median Price by Rating Group",
x = "Rating Group", y = "Price (₹)", fill = "Metric") +
scale_fill_manual(values = c("Avg_Price" = "#4798cc", "Median_Price" = "#a8dbfb")) +
theme_minimal()
These insights will guide Amazon to tweak pricing strategy, improve subscription tiers, and enhance product targeting.