knitr::opts_chunk$set(echo = TRUE)
Introduction This analysis explores an e-commerce dataset to uncover insights related to revenue, sales transactions, customer behavior, product performance, and more. The dataset includes information on product pricing, categories, reviews, and discounts.
# Load required libraries
library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(ggplot2)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
Load The Dataset
data <- read_csv("C:/Users/USER/Downloads/datasets/amazon sales/amazon.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 1465 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): product_id, product_name, category, discounted_price, actual_price...
## dbl (1): rating
## num (1): rating_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View structure and summary
glimpse(data)
## Rows: 1,465
## Columns: 16
## $ product_id <chr> "B07JW9H4J1", "B098NS6PVG", "B096MSW6CT", "B08HDJ8…
## $ product_name <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category <chr> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price <chr> "₹399", "₹199", "₹199", "₹329", "₹154", "₹149", "₹…
## $ actual_price <chr> "₹1,099", "₹349", "₹1,899", "₹699", "₹399", "₹1,00…
## $ discount_percentage <chr> "64%", "43%", "90%", "53%", "61%", "85%", "65%", "…
## $ rating <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count <dbl> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id <chr> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id <chr> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…
summary(data)
## product_id product_name category discounted_price
## Length:1465 Length:1465 Length:1465 Length:1465
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## actual_price discount_percentage rating rating_count
## Length:1465 Length:1465 Min. :2.000 Min. : 2
## Class :character Class :character 1st Qu.:4.000 1st Qu.: 1186
## Mode :character Mode :character Median :4.100 Median : 5179
## Mean :4.097 Mean : 18296
## 3rd Qu.:4.300 3rd Qu.: 17337
## Max. :5.000 Max. :426973
## NA's :1 NA's :2
## about_product user_id user_name review_id
## Length:1465 Length:1465 Length:1465 Length:1465
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## review_title review_content img_link product_link
## Length:1465 Length:1465 Length:1465 Length:1465
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
Data Cleaning and Handling Missing Values
# Check for missing values
colSums(is.na(data))
## product_id product_name category discounted_price
## 0 0 0 0
## actual_price discount_percentage rating rating_count
## 0 0 1 2
## about_product user_id user_name review_id
## 0 0 0 0
## review_title review_content img_link product_link
## 0 0 0 0
# Remove rows with missing values
data_clean <- data %>%
drop_na()
# Check updated dataset
dim(data_clean)
## [1] 1462 16
colSums(is.na(data_clean))
## product_id product_name category discounted_price
## 0 0 0 0
## actual_price discount_percentage rating rating_count
## 0 0 0 0
## about_product user_id user_name review_id
## 0 0 0 0
## review_title review_content img_link product_link
## 0 0 0 0
Data Type Conversion of columns needed for analysis
# Convert relevant columns to appropriate data types
data_clean <- data_clean %>%
mutate(
product_id = as.factor(product_id),
category = as.factor(category),
user_id = as.factor(user_id),
review_id = as.factor(review_id),
discounted_price = as.numeric(gsub("[^0-9.]", "", discounted_price)),
actual_price = as.numeric(gsub("[^0-9.]", "", actual_price)),
discount_percentage = as.numeric(gsub("[^0-9.]", "", discount_percentage)),
rating = as.numeric(rating),
rating_count = as.integer(rating_count),
product_name = as.character(product_name),
about_product = as.character(about_product),
user_name = as.character(user_name),
review_title = as.character(review_title),
review_content = as.character(review_content),
img_link = as.character(img_link),
product_link = as.character(product_link),
profit_margin = actual_price - discounted_price)
# View the cleaned structure
glimpse(data_clean)
## Rows: 1,462
## Columns: 17
## $ product_id <fct> B07JW9H4J1, B098NS6PVG, B096MSW6CT, B08HDJ86NZ, B0…
## $ product_name <chr> "Wayona Nylon Braided USB to Lightning Fast Chargi…
## $ category <fct> "Computers&Accessories|Accessories&Peripherals|Cab…
## $ discounted_price <dbl> 399.00, 199.00, 199.00, 329.00, 154.00, 149.00, 17…
## $ actual_price <dbl> 1099, 349, 1899, 699, 399, 1000, 499, 299, 999, 29…
## $ discount_percentage <dbl> 64, 43, 90, 53, 61, 85, 65, 23, 50, 33, 55, 63, 69…
## $ rating <dbl> 4.2, 4.0, 3.9, 4.2, 4.2, 3.9, 4.1, 4.3, 4.2, 4.0, …
## $ rating_count <int> 24269, 43994, 7928, 94363, 16905, 24871, 15188, 30…
## $ about_product <chr> "High Compatibility : Compatible With iPhone 12, 1…
## $ user_id <fct> "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNL…
## $ user_name <chr> "Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jaspree…
## $ review_id <fct> "R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1KD19…
## $ review_title <chr> "Satisfied,Charging is really fast,Value for money…
## $ review_content <chr> "Looks durable Charging is fine tooNo complains,Ch…
## $ img_link <chr> "https://m.media-amazon.com/images/W/WEBP_402378-T…
## $ product_link <chr> "https://www.amazon.in/Wayona-Braided-WN3LG1-Synci…
## $ profit_margin <dbl> 700.00, 150.00, 1700.00, 370.00, 245.00, 851.00, 3…
Exploratory Data Analysis Key Sales Metrics
# Add a sales_amount column
data_clean <- data_clean %>%
mutate(sales_amount = discounted_price)
# Calculate Key Metrics
# Total revenue
total_revenue <- sum(data_clean$sales_amount, na.rm = TRUE)
# Total number of transactions
total_transactions <- nrow(data_clean)
# Average and median sales values
avg_sales <- mean(data_clean$sales_amount, na.rm = TRUE)
median_sales <- median(data_clean$sales_amount, na.rm = TRUE)
# Display Metrics
key_metrics <- list(
total_revenue = total_revenue,
total_transactions = total_transactions,
avg_sales = avg_sales,
median_sales = median_sales
)
print(key_metrics)
## $total_revenue
## [1] 4576033
##
## $total_transactions
## [1] 1462
##
## $avg_sales
## [1] 3129.982
##
## $median_sales
## [1] 799
Which Categories Generate the Most Revenue?
category_revenue <- data_clean %>%
group_by(category) %>%
summarise(
total_revenue = sum(sales_amount, na.rm = TRUE),
avg_revenue = mean(sales_amount, na.rm = TRUE)
) %>%
arrange(desc(total_revenue))
# View Category Results
print(category_revenue)
## # A tibble: 211 × 3
## category total_revenue avg_revenue
## <fct> <dbl> <dbl>
## 1 Electronics|HomeTheater,TV&Video|Televisions|Smart… 1564932 24840.
## 2 Electronics|Mobiles&Accessories|Smartphones&BasicM… 1071302 15754.
## 3 Electronics|WearableTechnology|SmartWatches 177817 2340.
## 4 Home&Kitchen|Kitchen&HomeAppliances|WaterPurifiers… 84183 7015.
## 5 Computers&Accessories|Accessories&Peripherals|Cabl… 83601. 362.
## 6 Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAp… 81127. 3005.
## 7 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate… 75880 6323.
## 8 Home&Kitchen|Heating,Cooling&AirQuality|WaterHeate… 55250 2402.
## 9 Electronics|Headphones,Earbuds&Accessories|Headpho… 50761 976.
## 10 Home&Kitchen|Heating,Cooling&AirQuality|RoomHeater… 49403 2470.
## # ℹ 201 more rows
What are the top-selling and least-selling products
# Adding short product names for better visual output
data_clean <- data_clean %>%
mutate(short_product_name = str_extract(product_name, "^([^\\s]+\\s+){0,2}[^\\s]+"))
# Group and summarize products
top_products <- data_clean %>%
group_by(short_product_name) %>%
summarise(
total_revenue = sum(sales_amount, na.rm = TRUE),
total_quantity = sum(rating_count, na.rm = TRUE)
) %>%
arrange(desc(total_revenue))
kable(top_products %>% head(10), caption = "Top 10 Products by Revenue")
| short_product_name | total_revenue | total_quantity |
|---|---|---|
| Redmi Note 11 | 115993 | 280560 |
| Samsung Galaxy M13 | 103992 | 151984 |
| Samsung 138 cm | 93989 | 14218 |
| OnePlus Nord 2T | 91997 | 52245 |
| Sony Bravia 164 | 77990 | 5935 |
| Acer 139 cm | 68998 | 6314 |
| Samsung 108 cm | 63980 | 14218 |
| OnePlus 163.8 cm | 61999 | 6753 |
| iQOO Z6 Pro | 61997 | 28497 |
| iQOO Z6 44W | 58996 | 77009 |
kable(top_products %>% tail(10), caption = "Bottom 10 Products by Revenue")
| short_product_name | total_revenue | total_quantity |
|---|---|---|
| STRIFF Wall Mount | 89.00 | 9340 |
| VR 18 Pcs | 89.00 | 19621 |
| pTron Solero M241 | 89.00 | 1075 |
| Empty Mist Trigger | 85.00 | 212 |
| Kitchenwell 18Pc Plastic | 79.00 | 97 |
| Gizga Essentials Webcam | 69.00 | 255 |
| Classmate Pulse Spiral | 67.00 | 1269 |
| FLiX (Beetel Flow | 57.89 | 9378 |
| GIZGA essentials Universal | 39.00 | 15233 |
| Inventis 5V 1.2W | 39.00 | 13572 |
Analyze total sales and revenue by product category
# Analyze total sales and revenue by product category
category_performance <- data_clean %>%
separate(category, into = c("main_category", "sub_category"), sep = "\\|", extra = "merge", fill = "right") %>%
group_by(main_category) %>%
summarise(
total_revenue = sum(sales_amount, na.rm = TRUE),
total_sales_volume = sum(rating_count, na.rm = TRUE)
) %>%
arrange(desc(total_revenue))
# Print the results
print(category_performance)
## # A tibble: 9 × 3
## main_category total_revenue total_sales_volume
## <chr> <dbl> <int>
## 1 Electronics 3138057 15778848
## 2 Home&Kitchen 1042017. 2990077
## 3 Computers&Accessories 381273. 7728689
## 4 OfficeProducts 9349 149675
## 5 Car&Motorbike 2339 1118
## 6 MusicalInstruments 1276 88882
## 7 Health&PersonalCare 899 3663
## 8 HomeImprovement 674 8566
## 9 Toys&Games 150 15867
Customer Analysis Who are the top customers based on revenue or sales volume?
# Analyze customer revenue
customer_analysis <- data_clean %>%
mutate(first_name = str_extract(user_name, "^[^, ]+")) %>%
group_by(first_name) %>%
summarise(
total_revenue = sum(sales_amount * rating_count, na.rm = TRUE),
total_sales_volume = sum(rating_count, na.rm = TRUE)
) %>%
arrange(desc(total_revenue))
kable(customer_analysis %>% head(10), caption = "Top 10 Customers by Revenue")
| first_name | total_revenue | total_sales_volume |
|---|---|---|
| Roshan | 9099932664 | 1255336 |
| prateeq | 3980749049 | 180951 |
| Amazon | 3729471981 | 511369 |
| Manoj | 3288714222 | 167822 |
| Meghnad | 2639940912 | 203088 |
| Satheesh | 1975640016 | 151984 |
| ATHARVA | 1954239303 | 104697 |
| siddharth | 1732579987 | 115513 |
| Indro | 1652658528 | 77472 |
| Kapil | 1622826534 | 95466 |
Are there repeat customers?
# Add a new column with only the first name extracted from `user_name`
data_clean <- data_clean %>%
mutate(first_name = str_extract(user_name, "^[^, ]+"))
repeat_customers <- data_clean %>%
group_by(first_name) %>%
summarise(transaction_count = n()) %>%
filter(transaction_count > 1)
# View repeat customers
print(repeat_customers)
## # A tibble: 214 × 2
## first_name transaction_count
## <chr> <int>
## 1 "$@|\\|TO$|-|" 10
## 2 "A" 2
## 3 "ABHISHEK" 2
## 4 "ANURAG" 2
## 5 "ASR" 3
## 6 "ATHARVA" 3
## 7 "AV" 6
## 8 "Abhay" 4
## 9 "Abhishek" 5
## 10 "Actual" 5
## # ℹ 204 more rows
# Segment customers based on purchase volume
customer_segments <- data_clean %>%
group_by(user_id) %>%
summarise(
total_purchases = sum(rating_count, na.rm = TRUE),
total_spent = sum(sales_amount, na.rm = TRUE)
) %>%
mutate(
customer_segment = case_when(
total_purchases <= 5 ~ "Low Purchaser",
total_purchases > 5 & total_purchases <= 15 ~ "Medium Purchaser",
total_purchases > 15 ~ "High Purchaser"
)
) %>%
group_by(customer_segment) %>%
summarise(
avg_spent_per_customer = mean(total_spent, na.rm = TRUE),
total_customers = n()
)
# Print the results
print(customer_segments)
## # A tibble: 3 × 3
## customer_segment avg_spent_per_customer total_customers
## <chr> <dbl> <int>
## 1 High Purchaser 3877. 1174
## 2 Low Purchaser 762. 6
## 3 Medium Purchaser 1786. 11
Discount and Promotion Analysis Correlation Between Discounts and Sales Volume
correlation <- cor(data_clean$discount_percentage, data_clean$rating_count, use = "complete.obs")
cat("Correlation between discount percentage and sales volume is:", correlation)
## Correlation between discount percentage and sales volume is: 0.01129439
Discount Range Analysis
discount_analysis <- data_clean %>%
mutate(
discount_range = case_when(
discount_percentage >= 0 & discount_percentage < 20 ~ "0-20%",
discount_percentage >= 20 & discount_percentage < 40 ~ "20-40%",
discount_percentage >= 40 & discount_percentage < 60 ~ "40-60%",
discount_percentage >= 60 ~ "60%+"
)
) %>%
group_by(discount_range) %>%
summarise(
total_sales_volume = sum(rating_count, na.rm = TRUE),
total_revenue = sum(sales_amount, na.rm = TRUE),
avg_rating = mean(rating, na.rm = TRUE)
)
# Create a table with proper spacing between the title and content
discount_analysis %>%
kable(caption = "Discount Range Analysis") %>%
kable_styling(latex_options = c("hold_position"), font_size = 12) %>%
row_spec(0, extra_css = "padding-bottom: 15px;") # Adds space below the header row
| discount_range | total_sales_volume | total_revenue | avg_rating |
|---|---|---|---|
| 0-20% | 2467316 | 643894.0 | 4.168485 |
| 20-40% | 6331403 | 2112311.9 | 4.134568 |
| 40-60% | 7764342 | 1383549.3 | 4.089583 |
| 60%+ | 10202324 | 436278.2 | 4.054767 |
Grouping products by rating range
rating_analysis <- data_clean %>%
mutate(
rating_range = case_when(
rating >= 0 & rating < 3 ~ "Low (0-3)",
rating >= 3 & rating < 4 ~ "Medium (3-4)",
rating >= 4 ~ "High (4+)"
)
) %>%
group_by(rating_range) %>%
summarise(
total_sales_volume = sum(rating_count, na.rm = TRUE),
total_revenue = sum(sales_amount, na.rm = TRUE)
)
# Print the results
print(rating_analysis)
## # A tibble: 3 × 3
## rating_range total_sales_volume total_revenue
## <chr> <int> <dbl>
## 1 High (4+) 23522400 3977642.
## 2 Low (0-3) 394 3108
## 3 Medium (3-4) 3242591 595283.