knitr::opts_chunk$set(echo = TRUE)

Introduction This analysis explores an e-commerce dataset to uncover insights related to revenue, sales transactions, customer behavior, product performance, and more. The dataset includes information on product pricing, categories, reviews, and discounts.

# Load required libraries 
library(readxl)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(ggplot2)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Load The Dataset

data <- read_csv("C:/Users/USER/Downloads/datasets/amazon sales/amazon.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 1465 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): product_id, product_name, category, discounted_price, actual_price...
## dbl  (1): rating
## num  (1): rating_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View structure and summary 
glimpse(data)
## Rows: 1,465
## Columns: 16
## $ product_id          <chr> "B07JW9H4J1", "B098NS6PVG", "B096MSW6CT", "B08HDJ8…
## $ product_name        <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category            <chr> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price    <chr> "₹399", "₹199", "₹199", "₹329", "₹154", "₹149", "₹…
## $ actual_price        <chr> "₹1,099", "₹349", "₹1,899", "₹699", "₹399", "₹1,00…
## $ discount_percentage <chr> "64%", "43%", "90%", "53%", "61%", "85%", "65%", "…
## $ rating              <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count        <dbl> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product       <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id             <chr> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name           <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id           <chr> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title        <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content      <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link            <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link        <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…
summary(data)
##   product_id        product_name         category         discounted_price  
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  actual_price       discount_percentage     rating       rating_count   
##  Length:1465        Length:1465         Min.   :2.000   Min.   :     2  
##  Class :character   Class :character    1st Qu.:4.000   1st Qu.:  1186  
##  Mode  :character   Mode  :character    Median :4.100   Median :  5179  
##                                         Mean   :4.097   Mean   : 18296  
##                                         3rd Qu.:4.300   3rd Qu.: 17337  
##                                         Max.   :5.000   Max.   :426973  
##                                         NA's   :1       NA's   :2       
##  about_product        user_id           user_name          review_id        
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  review_title       review_content       img_link         product_link      
##  Length:1465        Length:1465        Length:1465        Length:1465       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 

Data Cleaning and Handling Missing Values

# Check for missing values
colSums(is.na(data))
##          product_id        product_name            category    discounted_price 
##                   0                   0                   0                   0 
##        actual_price discount_percentage              rating        rating_count 
##                   0                   0                   1                   2 
##       about_product             user_id           user_name           review_id 
##                   0                   0                   0                   0 
##        review_title      review_content            img_link        product_link 
##                   0                   0                   0                   0
# Remove rows with missing values
data_clean <- data %>%
  drop_na()

# Check updated dataset
dim(data_clean)
## [1] 1462   16
colSums(is.na(data_clean))
##          product_id        product_name            category    discounted_price 
##                   0                   0                   0                   0 
##        actual_price discount_percentage              rating        rating_count 
##                   0                   0                   0                   0 
##       about_product             user_id           user_name           review_id 
##                   0                   0                   0                   0 
##        review_title      review_content            img_link        product_link 
##                   0                   0                   0                   0

Data Type Conversion of columns needed for analysis

# Convert relevant columns to appropriate data types
data_clean <- data_clean %>%
  mutate(
    product_id = as.factor(product_id),
    category = as.factor(category),
    user_id = as.factor(user_id),
    review_id = as.factor(review_id),
    
    discounted_price = as.numeric(gsub("[^0-9.]", "", discounted_price)),
    actual_price = as.numeric(gsub("[^0-9.]", "", actual_price)),
    discount_percentage = as.numeric(gsub("[^0-9.]", "", discount_percentage)),
    rating = as.numeric(rating),
    
    rating_count = as.integer(rating_count),
    product_name = as.character(product_name),
    about_product = as.character(about_product),
    user_name = as.character(user_name),
    review_title = as.character(review_title),
    review_content = as.character(review_content),
    img_link = as.character(img_link),
    product_link = as.character(product_link),
    profit_margin = actual_price - discounted_price)

# View the cleaned structure
glimpse(data_clean)
## Rows: 1,462
## Columns: 17
## $ product_id          <fct> B07JW9H4J1, B098NS6PVG, B096MSW6CT, B08HDJ86NZ, B0…
## $ product_name        <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category            <fct> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price    <dbl> 399.00, 199.00, 199.00, 329.00, 154.00, 149.00, 17…
## $ actual_price        <dbl> 1099, 349, 1899, 699, 399, 1000, 499, 299, 999, 29…
## $ discount_percentage <dbl> 64, 43, 90, 53, 61, 85, 65, 23, 50, 33, 55, 63, 69…
## $ rating              <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count        <int> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product       <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id             <fct> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name           <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id           <fct> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title        <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content      <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link            <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link        <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…
## $ profit_margin       <dbl> 700.00, 150.00, 1700.00, 370.00, 245.00, 851.00, 3…

Exploratory Data Analysis Key Sales Metrics

# Add a sales_amount column
data_clean <- data_clean %>%
  mutate(sales_amount = discounted_price)

# Calculate Key Metrics
# Total revenue
total_revenue <- sum(data_clean$sales_amount, na.rm = TRUE)

# Total number of transactions
total_transactions <- nrow(data_clean)

# Average and median sales values
avg_sales <- mean(data_clean$sales_amount, na.rm = TRUE)
median_sales <- median(data_clean$sales_amount, na.rm = TRUE)

# Display Metrics
key_metrics <- list(
  total_revenue = total_revenue,
  total_transactions = total_transactions,
  avg_sales = avg_sales,
  median_sales = median_sales
)
print(key_metrics)
## $total_revenue
## [1] 4576033
## 
## $total_transactions
## [1] 1462
## 
## $avg_sales
## [1] 3129.982
## 
## $median_sales
## [1] 799

Which Categories Generate the Most Revenue?

category_revenue <- data_clean %>%
  group_by(category) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    avg_revenue = mean(sales_amount, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

# View Category Results
print(category_revenue)
## # A tibble: 211 × 3
##    category                                            total_revenue avg_revenue
##    <fct>                                                       <dbl>       <dbl>
##  1 Electronics|HomeTheater,TV&Video|Televisions|Smart…      1564932       24840.
##  2 Electronics|Mobiles&Accessories|Smartphones&BasicM…      1071302       15754.
##  3 Electronics|WearableTechnology|SmartWatches               177817        2340.
##  4 Home&Kitchen|Kitchen&HomeAppliances|WaterPurifiers…        84183        7015.
##  5 Computers&Accessories|Accessories&Peripherals|Cabl…        83601.        362.
##  6 Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAp…        81127.       3005.
##  7 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate…        75880        6323.
##  8 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate…        55250        2402.
##  9 Electronics|Headphones,Earbuds&Accessories|Headpho…        50761         976.
## 10 Home&Kitchen|Heating,Cooling&AirQuality|RoomHeater…        49403        2470.
## # ℹ 201 more rows

What are the top-selling and least-selling products

# Adding short product names for better visual output
data_clean <- data_clean %>%
  mutate(short_product_name = str_extract(product_name, "^([^\\s]+\\s+){0,2}[^\\s]+"))

# Group and summarize products
top_products <- data_clean %>%
  group_by(short_product_name) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    total_quantity = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

kable(top_products %>% head(10), caption = "Top 10 Products by Revenue")
Top 10 Products by Revenue
short_product_name total_revenue total_quantity
Redmi Note 11 115993 280560
Samsung Galaxy M13 103992 151984
Samsung 138 cm 93989 14218
OnePlus Nord 2T 91997 52245
Sony Bravia 164 77990 5935
Acer 139 cm 68998 6314
Samsung 108 cm 63980 14218
OnePlus 163.8 cm 61999 6753
iQOO Z6 Pro 61997 28497
iQOO Z6 44W 58996 77009
kable(top_products %>% tail(10), caption = "Bottom 10 Products by Revenue")
Bottom 10 Products by Revenue
short_product_name total_revenue total_quantity
STRIFF Wall Mount 89.00 9340
VR 18 Pcs 89.00 19621
pTron Solero M241 89.00 1075
Empty Mist Trigger 85.00 212
Kitchenwell 18Pc Plastic 79.00 97
Gizga Essentials Webcam 69.00 255
Classmate Pulse Spiral 67.00 1269
FLiX (Beetel Flow 57.89 9378
GIZGA essentials Universal 39.00 15233
Inventis 5V 1.2W 39.00 13572

Analyze total sales and revenue by product category

# Analyze total sales and revenue by product category
category_performance <- data_clean %>%
  separate(category, into = c("main_category", "sub_category"), sep = "\\|", extra = "merge", fill = "right") %>%
  group_by(main_category) %>%
  summarise(
    total_revenue = sum(sales_amount, na.rm = TRUE),
    total_sales_volume = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

# Print the results
print(category_performance)
## # A tibble: 9 × 3
##   main_category         total_revenue total_sales_volume
##   <chr>                         <dbl>              <int>
## 1 Electronics                3138057            15778848
## 2 Home&Kitchen               1042017.            2990077
## 3 Computers&Accessories       381273.            7728689
## 4 OfficeProducts                9349              149675
## 5 Car&Motorbike                 2339                1118
## 6 MusicalInstruments            1276               88882
## 7 Health&PersonalCare            899                3663
## 8 HomeImprovement                674                8566
## 9 Toys&Games                     150               15867

Customer Analysis Who are the top customers based on revenue or sales volume?

# Analyze customer revenue
customer_analysis <- data_clean %>%
  mutate(first_name = str_extract(user_name, "^[^, ]+")) %>%
  group_by(first_name) %>%
  summarise(
    total_revenue = sum(sales_amount * rating_count, na.rm = TRUE),
    total_sales_volume = sum(rating_count, na.rm = TRUE)
  ) %>%
  arrange(desc(total_revenue))

kable(customer_analysis %>% head(10), caption = "Top 10 Customers by Revenue")
Top 10 Customers by Revenue
first_name total_revenue total_sales_volume
Roshan 9099932664 1255336
prateeq 3980749049 180951
Amazon 3729471981 511369
Manoj 3288714222 167822
Meghnad 2639940912 203088
Satheesh 1975640016 151984
ATHARVA 1954239303 104697
siddharth 1732579987 115513
Indro 1652658528 77472
Kapil 1622826534 95466

Are there repeat customers?

# Add a new column with only the first name extracted from `user_name`
data_clean <- data_clean %>%
  mutate(first_name = str_extract(user_name, "^[^, ]+"))

repeat_customers <- data_clean %>%
  group_by(first_name) %>%
  summarise(transaction_count = n()) %>%
  filter(transaction_count > 1)

# View repeat customers
print(repeat_customers)
## # A tibble: 214 × 2
##    first_name     transaction_count
##    <chr>                      <int>
##  1 "$@|\\|TO$|-|"                10
##  2 "A"                            2
##  3 "ABHISHEK"                     2
##  4 "ANURAG"                       2
##  5 "ASR"                          3
##  6 "ATHARVA"                      3
##  7 "AV"                           6
##  8 "Abhay"                        4
##  9 "Abhishek"                     5
## 10 "Actual"                       5
## # ℹ 204 more rows
# Segment customers based on purchase volume
customer_segments <- data_clean %>%
  group_by(user_id) %>%
  summarise(
    total_purchases = sum(rating_count, na.rm = TRUE),
    total_spent = sum(sales_amount, na.rm = TRUE)
  ) %>%
  mutate(
    customer_segment = case_when(
      total_purchases <= 5 ~ "Low Purchaser",
      total_purchases > 5 & total_purchases <= 15 ~ "Medium Purchaser",
      total_purchases > 15 ~ "High Purchaser"
    )
  ) %>%
  group_by(customer_segment) %>%
  summarise(
    avg_spent_per_customer = mean(total_spent, na.rm = TRUE),
    total_customers = n()
  )

# Print the results
print(customer_segments)
## # A tibble: 3 × 3
##   customer_segment avg_spent_per_customer total_customers
##   <chr>                             <dbl>           <int>
## 1 High Purchaser                    3877.            1174
## 2 Low Purchaser                      762.               6
## 3 Medium Purchaser                  1786.              11

Discount and Promotion Analysis Correlation Between Discounts and Sales Volume

correlation <- cor(data_clean$discount_percentage, data_clean$rating_count, use = "complete.obs")
cat("Correlation between discount percentage and sales volume is:", correlation)
## Correlation between discount percentage and sales volume is: 0.01129439

Discount Range Analysis

discount_analysis <- data_clean %>%
  mutate(
    discount_range = case_when(
      discount_percentage >= 0 & discount_percentage < 20 ~ "0-20%",
      discount_percentage >= 20 & discount_percentage < 40 ~ "20-40%",
      discount_percentage >= 40 & discount_percentage < 60 ~ "40-60%",
      discount_percentage >= 60 ~ "60%+"
    )
  ) %>%
  group_by(discount_range) %>%
  summarise(
    total_sales_volume = sum(rating_count, na.rm = TRUE),
    total_revenue = sum(sales_amount, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE)
  )

# Create a table with proper spacing between the title and content
discount_analysis %>%
  kable(caption = "Discount Range Analysis") %>%
  kable_styling(latex_options = c("hold_position"), font_size = 12) %>%
  row_spec(0, extra_css = "padding-bottom: 15px;")  # Adds space below the header row
Discount Range Analysis
discount_range total_sales_volume total_revenue avg_rating
0-20% 2467316 643894.0 4.168485
20-40% 6331403 2112311.9 4.134568
40-60% 7764342 1383549.3 4.089583
60%+ 10202324 436278.2 4.054767

Grouping products by rating range

rating_analysis <- data_clean %>%
  mutate(
    rating_range = case_when(
      rating >= 0 & rating < 3 ~ "Low (0-3)",
      rating >= 3 & rating < 4 ~ "Medium (3-4)",
      rating >= 4 ~ "High (4+)"
    )
  ) %>%
  group_by(rating_range) %>%
  summarise(
    total_sales_volume = sum(rating_count, na.rm = TRUE),
    total_revenue = sum(sales_amount, na.rm = TRUE)
  )

# Print the results
print(rating_analysis)
## # A tibble: 3 × 3
##   rating_range total_sales_volume total_revenue
##   <chr>                     <int>         <dbl>
## 1 High (4+)              23522400      3977642.
## 2 Low (0-3)                   394         3108 
## 3 Medium (3-4)            3242591       595283.