E-Commerce Sales Analysis

knitr::opts_chunk$set(echo = TRUE)

Introduction This analysis explores an e-commerce dataset to uncover insights related to revenue, sales transactions, customer behavior, product performance, and more. The dataset includes information on product pricing, categories, reviews, and discounts.

# Load required libraries 
library(readxl)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr)
library(ggplot2)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Load The Dataset

data <- read_csv("C:/Users/USER/Downloads/datasets/amazon sales/amazon.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 1465 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): product_id, product_name, category, discounted_price, actual_price...
## dbl  (1): rating
## num  (1): rating_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View structure and summary 
glimpse(data)

## Rows: 1,465
## Columns: 16
## $ product_id          <chr> "B07JW9H4J1", "B098NS6PVG", "B096MSW6CT", "B08HDJ8…
## $ product_name        <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category            <chr> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price    <chr> "₹399", "₹199", "₹199", "₹329", "₹154", "₹149", "₹…
## $ actual_price        <chr> "₹1,099", "₹349", "₹1,899", "₹699", "₹399", "₹1,00…
## $ discount_percentage <chr> "64%", "43%", "90%", "53%", "61%", "85%", "65%", "…
## $ rating              <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count        <dbl> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product       <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id             <chr> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name           <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id           <chr> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title        <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content      <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link            <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link        <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…

summary(data)

##   product_id        product_name         category         discounted_price  
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  actual_price       discount_percentage     rating       rating_count   
##  Length:1465        Length:1465         Min.   :2.000   Min.   :     2  
##  Class :character   Class :character    1st Qu.:4.000   1st Qu.:  1186  
##  Mode  :character   Mode  :character    Median :4.100   Median :  5179  
##                                         Mean   :4.097   Mean   : 18296  
##                                         3rd Qu.:4.300   3rd Qu.: 17337  
##                                         Max.   :5.000   Max.   :426973  
##                                         NA's   :1       NA's   :2       
##  about_product        user_id           user_name          review_id        
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  review_title       review_content       img_link         product_link      
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##

Data Cleaning and Handling Missing Values

# Check for missing values
colSums(is.na(data))

##          product_id        product_name            category    discounted_price 
##                   0                   0                   0                   0 
##        actual_price discount_percentage              rating        rating_count 
##                   0                   0                   1                   2 
##       about_product             user_id           user_name           review_id 
##                   0                   0                   0                   0 
##        review_title      review_content            img_link        product_link 
##                   0                   0                   0                   0

# Remove rows with missing values
data_clean <- data %>%
  drop_na()

# Check updated dataset
dim(data_clean)

## [1] 1462   16

colSums(is.na(data_clean))

##          product_id        product_name            category    discounted_price 
##                   0                   0                   0                   0 
##        actual_price discount_percentage              rating        rating_count 
##                   0                   0                   0                   0 
##       about_product             user_id           user_name           review_id 
##                   0                   0                   0                   0 
##        review_title      review_content            img_link        product_link 
##                   0                   0                   0                   0

Data Type Conversion of columns needed for analysis

# Convert relevant columns to appropriate data types
data_clean <- data_clean %>%
  mutate(
    product_id = as.factor(product_id),
    category = as.factor(category),
    user_id = as.factor(user_id),
    review_id = as.factor(review_id),
    
    discounted_price = as.numeric(gsub("[^0-9.]", "", discounted_price)),
    actual_price = as.numeric(gsub("[^0-9.]", "", actual_price)),
    discount_percentage = as.numeric(gsub("[^0-9.]", "", discount_percentage)),
    rating = as.numeric(rating),
    
    rating_count = as.integer(rating_count),
    product_name = as.character(product_name),
    about_product = as.character(about_product),
    user_name = as.character(user_name),
    review_title = as.character(review_title),
    review_content = as.character(review_content),
    img_link = as.character(img_link),
    product_link = as.character(product_link),
    profit_margin = actual_price - discounted_price)

# View the cleaned structure
glimpse(data_clean)

## Rows: 1,462
## Columns: 17
## $ product_id          <fct> B07JW9H4J1, B098NS6PVG, B096MSW6CT, B08HDJ86NZ, B0…
## $ product_name        <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category            <fct> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price    <dbl> 399.00, 199.00, 199.00, 329.00, 154.00, 149.00, 17…
## $ actual_price        <dbl> 1099, 349, 1899, 699, 399, 1000, 499, 299, 999, 29…
## $ discount_percentage <dbl> 64, 43, 90, 53, 61, 85, 65, 23, 50, 33, 55, 63, 69…
## $ rating              <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count        <int> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product       <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id             <fct> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name           <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id           <fct> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title        <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content      <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link            <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link        <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…
## $ profit_margin       <dbl> 700.00, 150.00, 1700.00, 370.00, 245.00, 851.00, 3…

Exploratory Data Analysis Key Sales Metrics

# Add a sales_amount column
data_clean <- data_clean %>%
  mutate(sales_amount = discounted_price)

# Calculate Key Metrics
# Total revenue
total_revenue <- sum(data_clean$sales_amount, na.rm = TRUE)

# Total number of transactions
total_transactions <- nrow(data_clean)

# Average and median sales values
avg_sales <- mean(data_clean$sales_amount, na.rm = TRUE)
median_sales <- median(data_clean$sales_amount, na.rm = TRUE)

# Display Metrics
key_metrics <- list(
  total_revenue = total_revenue,
  total_transactions = total_transactions,
  avg_sales = avg_sales,
  median_sales = median_sales
)
print(key_metrics)

## $total_revenue
## [1] 4576033
## 
## $total_transactions
## [1] 1462
## 
## $avg_sales
## [1] 3129.982
## 
## $median_sales
## [1] 799

Which Categories Generate the Most Revenue?

category_revenue <- data_clean %>%
  group_by(category) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    avg_revenue = mean(sales_amount, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

# View Category Results
print(category_revenue)

## # A tibble: 211 × 3
##    category                                            total_revenue avg_revenue
##    <fct>                                                       <dbl>       <dbl>
##  1 Electronics|HomeTheater,TV&Video|Televisions|Smart…      1564932       24840.
##  2 Electronics|Mobiles&Accessories|Smartphones&BasicM…      1071302       15754.
##  3 Electronics|WearableTechnology|SmartWatches               177817        2340.
##  4 Home&Kitchen|Kitchen&HomeAppliances|WaterPurifiers…        84183        7015.
##  5 Computers&Accessories|Accessories&Peripherals|Cabl…        83601.        362.
##  6 Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAp…        81127.       3005.
##  7 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate…        75880        6323.
##  8 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate…        55250        2402.
##  9 Electronics|Headphones,Earbuds&Accessories|Headpho…        50761         976.
## 10 Home&Kitchen|Heating,Cooling&AirQuality|RoomHeater…        49403        2470.
## # ℹ 201 more rows

What are the top-selling and least-selling products

# Adding short product names for better visual output
data_clean <- data_clean %>%
  mutate(short_product_name = str_extract(product_name, "^([^\\s]+\\s+){0,2}[^\\s]+"))

# Group and summarize products
top_products <- data_clean %>%
  group_by(short_product_name) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    total_quantity = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

kable(top_products %>% head(10), caption = "Top 10 Products by Revenue")

Top 10 Products by Revenue
short_product_name	total_revenue	total_quantity
Redmi Note 11	115993	280560
Samsung Galaxy M13	103992	151984
Samsung 138 cm	93989	14218
OnePlus Nord 2T	91997	52245
Sony Bravia 164	77990	5935
Acer 139 cm	68998	6314
Samsung 108 cm	63980	14218
OnePlus 163.8 cm	61999	6753
iQOO Z6 Pro	61997	28497
iQOO Z6 44W	58996	77009

kable(top_products %>% tail(10), caption = "Bottom 10 Products by Revenue")

Bottom 10 Products by Revenue
short_product_name	total_revenue	total_quantity
STRIFF Wall Mount	89.00	9340
VR 18 Pcs	89.00	19621
pTron Solero M241	89.00	1075
Empty Mist Trigger	85.00	212
Kitchenwell 18Pc Plastic	79.00	97
Gizga Essentials Webcam	69.00	255
Classmate Pulse Spiral	67.00	1269
FLiX (Beetel Flow	57.89	9378
GIZGA essentials Universal	39.00	15233
Inventis 5V 1.2W	39.00	13572

Analyze total sales and revenue by product category

# Analyze total sales and revenue by product category
category_performance <- data_clean %>%
  separate(category, into = c("main_category", "sub_category"), sep = "\\|", extra = "merge", fill = "right") %>%
  group_by(main_category) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    total_sales_volume = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

# Print the results
print(category_performance)

## # A tibble: 9 × 3
##   main_category         total_revenue total_sales_volume
##   <chr>                         <dbl>              <int>
## 1 Electronics                3138057            15778848
## 2 Home&Kitchen               1042017.            2990077
## 3 Computers&Accessories       381273.            7728689
## 4 OfficeProducts                9349              149675
## 5 Car&Motorbike                 2339                1118
## 6 MusicalInstruments            1276               88882
## 7 Health&PersonalCare            899                3663
## 8 HomeImprovement                674                8566
## 9 Toys&Games                     150               15867

Customer Analysis Who are the top customers based on revenue or sales volume?

# Analyze customer revenue
customer_analysis <- data_clean %>%
  mutate(first_name = str_extract(user_name, "^[^, ]+")) %>%
  group_by(first_name) %>%
  summarise(
    total_revenue = sum(sales_amount * rating_count, na.rm = TRUE),
    total_sales_volume = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

kable(customer_analysis %>% head(10), caption = "Top 10 Customers by Revenue")

Top 10 Customers by Revenue
first_name	total_revenue	total_sales_volume
Roshan	9099932664	1255336
prateeq	3980749049	180951
Amazon	3729471981	511369
Manoj	3288714222	167822
Meghnad	2639940912	203088
Satheesh	1975640016	151984
ATHARVA	1954239303	104697
siddharth	1732579987	115513
Indro	1652658528	77472
Kapil	1622826534	95466

Are there repeat customers?

# Add a new column with only the first name extracted from `user_name`
data_clean <- data_clean %>%
  mutate(first_name = str_extract(user_name, "^[^, ]+"))

repeat_customers <- data_clean %>%
  group_by(first_name) %>%
  summarise(transaction_count = n()) %>%
  filter(transaction_count > 1)

# View repeat customers
print(repeat_customers)

## # A tibble: 214 × 2
##    first_name     transaction_count
##    <chr>                      <int>
##  1 "$@|\\|TO$|-|"                10
##  2 "A"                            2
##  3 "ABHISHEK"                     2
##  4 "ANURAG"                       2
##  5 "ASR"                          3
##  6 "ATHARVA"                      3
##  7 "AV"                           6
##  8 "Abhay"                        4
##  9 "Abhishek"                     5
## 10 "Actual"                       5
## # ℹ 204 more rows

# Segment customers based on purchase volume
customer_segments <- data_clean %>%
  group_by(user_id) %>%
  summarise(
    total_purchases = sum(rating_count, na.rm = TRUE),
    total_spent = sum(sales_amount, na.rm = TRUE)
  ) %>%
  mutate(
    customer_segment = case_when(
      total_purchases <= 5 ~ "Low Purchaser",
      total_purchases > 5 & total_purchases <= 15 ~ "Medium Purchaser",
      total_purchases > 15 ~ "High Purchaser"
    )
  ) %>%
  group_by(customer_segment) %>%
  summarise(
    avg_spent_per_customer = mean(total_spent, na.rm = TRUE),
    total_customers = n()
  )

# Print the results
print(customer_segments)

## # A tibble: 3 × 3
##   customer_segment avg_spent_per_customer total_customers
##   <chr>                             <dbl>           <int>
## 1 High Purchaser                    3877.            1174
## 2 Low Purchaser                      762.               6
## 3 Medium Purchaser                  1786.              11

Discount and Promotion Analysis Correlation Between Discounts and Sales Volume

correlation <- cor(data_clean$discount_percentage, data_clean$rating_count, use = "complete.obs")
cat("Correlation between discount percentage and sales volume is:", correlation)

## Correlation between discount percentage and sales volume is: 0.01129439

Discount Range Analysis

discount_analysis <- data_clean %>%
  mutate(
    discount_range = case_when(
      discount_percentage >= 0 & discount_percentage < 20 ~ "0-20%",
      discount_percentage >= 20 & discount_percentage < 40 ~ "20-40%",
      discount_percentage >= 40 & discount_percentage < 60 ~ "40-60%",
      discount_percentage >= 60 ~ "60%+"
    )
  ) %>%
  group_by(discount_range) %>%
  summarise(
    total_sales_volume = sum(rating_count, na.rm = TRUE),
    total_revenue = sum(sales_amount, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE)
  )

# Create a table with proper spacing between the title and content
discount_analysis %>%
  kable(caption = "Discount Range Analysis") %>%
  kable_styling(latex_options = c("hold_position"), font_size = 12) %>%
  row_spec(0, extra_css = "padding-bottom: 15px;")  # Adds space below the header row

Discount Range Analysis
discount_range	total_sales_volume	total_revenue	avg_rating
0-20%	2467316	643894.0	4.168485
20-40%	6331403	2112311.9	4.134568
40-60%	7764342	1383549.3	4.089583
60%+	10202324	436278.2	4.054767

Grouping products by rating range

rating_analysis <- data_clean %>%
  mutate(
    rating_range = case_when(
      rating >= 0 & rating < 3 ~ "Low (0-3)",
      rating >= 3 & rating < 4 ~ "Medium (3-4)",
      rating >= 4 ~ "High (4+)"
    )
  ) %>%
  group_by(rating_range) %>%
  summarise(
    total_sales_volume = sum(rating_count, na.rm = TRUE),
    total_revenue = sum(sales_amount, na.rm = TRUE)
  )

# Print the results
print(rating_analysis)

## # A tibble: 3 × 3
##   rating_range total_sales_volume total_revenue
##   <chr>                     <int>         <dbl>
## 1 High (4+)              23522400      3977642.
## 2 Low (0-3)                   394         3108 
## 3 Medium (3-4)            3242591       595283.

E-Commerce Sales Analysis

Mark Mayana

2024-12-19