This project analyzes Amazon product data to understand patterns in pricing, discounts, ratings, and customer reviews. The goal is to identify factors associated with customer satisfaction and product popularity using data visualization techniques in R.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(viridis)
## Warning: package 'viridis' was built under R version 4.5.3
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
# ---- Load and clean the data ----
# Read the raw product listing. Prices, discounts and rating counts are
# converted from formatted text (currency symbol, thousands separators, "%").
amazon <- read.csv("amazon.csv")

# Convert a column of formatted text to numbers.
#   x      vector to convert.
#   column name of the column, used only in the diagnostic message.
#   strip  optional regular expression of formatting characters to remove
#          before conversion.
# Returns a numeric vector the same length as `x`. Entries that are still not
# numeric after stripping become NA, exactly as with a bare as.numeric(); the
# difference is that they are reported by column, count and value instead of
# through the generic "NAs introduced by coercion" warning.
to_number <- function(x, column, strip = NULL) {
  text <- as.character(x)
  if (!is.null(strip)) {
    text <- gsub(strip, "", text)
  }
  # The generic coercion warning is replaced by the specific message below.
  value <- suppressWarnings(as.numeric(text))
  # Only non-empty, non-missing text that failed to convert counts as a failure.
  failed <- is.na(value) & !is.na(text) & nzchar(trimws(text))
  if (any(failed)) {
    message(
      sum(failed), " value(s) in `", column,
      "` are not numeric and were set to NA: ",
      toString(head(unique(text[failed]), 5))
    )
  }
  value
}

amazon <- amazon %>%
  mutate(
    actual_price_num = to_number(
      actual_price,
      column = "actual_price", strip = "[₹,]"
    ),
    discounted_price_num = to_number(
      discounted_price,
      column = "discounted_price", strip = "[₹,]"
    ),
    discount_percentage_num = to_number(
      discount_percentage,
      column = "discount_percentage", strip = "[%]"
    ),
    # `rating` is overwritten in place because later chunks use it directly.
    rating = to_number(rating, column = "rating"),
    rating_count_num = to_number(
      rating_count,
      column = "rating_count", strip = ","
    )
  )
# Bar chart of the categories with the highest mean rating.
amazon |>
  group_by(category) |>
  # Mean rating per category, ignoring missing ratings.
  summarise(avg_rating = mean(rating, na.rm = TRUE)) |>
  slice_max(order_by = avg_rating, n = 10) |>
  ggplot(mapping = aes(x = reorder(category, avg_rating), y = avg_rating)) +
  geom_col(fill = "steelblue") +
  # Horizontal bars keep the category labels readable.
  coord_flip() +
  ggtitle("Top Categories by Average Rating") +
  xlab("Category") +
  ylab("Average Rating")
The categories at the top of the chart have the highest average customer ratings.
# Histogram of product ratings in bins 0.1 wide.
amazon |>
  ggplot(mapping = aes(x = rating)) +
  geom_histogram(binwidth = 0.1, fill = "orange", colour = "white") +
  ggtitle("Distribution of Product Ratings") +
  xlab("Rating") +
  ylab("Count")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Most products receive ratings above 4 stars.
# Box plot of discount percentage per category, with categories ordered by
# their median discount.
ggplot(
  data = amazon,
  mapping = aes(
    x = reorder(category, discount_percentage_num, FUN = median),
    y = discount_percentage_num
  )
) +
  geom_boxplot(fill = "purple") +
  # Flip so the category names run down the vertical axis.
  coord_flip() +
  ggtitle("Discount Percentage by Category") +
  xlab("Category") +
  ylab("Discount (%)")
Some categories offer significantly larger discounts than others.
# Scatter plot of rating against discount percentage with a fitted
# straight line (linear model) overlaid.
amazon |>
  ggplot(mapping = aes(x = discount_percentage_num, y = rating)) +
  geom_point(alpha = 0.5, colour = "darkgreen") +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Relationship Between Rating and Discount") +
  xlab("Discount (%)") +
  ylab("Rating")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
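The fitted linear trend line shows whether larger discounts are associated with higher or lower ratings; on its own it does not establish that discounts cause a change in ratings.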
# Bar chart of the categories with the highest mean list price.
amazon |>
  group_by(category) |>
  # Mean of the numeric list price per category, ignoring missing prices.
  summarise(avg_price = mean(actual_price_num, na.rm = TRUE)) |>
  slice_max(order_by = avg_price, n = 10) |>
  ggplot(mapping = aes(x = reorder(category, avg_price), y = avg_price)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  # Thousands separators on the price axis.
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Average Price by Category") +
  xlab("Category") +
  ylab("Average Price (₹)")
Certain categories contain substantially higher-priced products.
# Heatmap of how many products fall into each combination of rating bin and
# discount bin. Both variables are split into 5 equal-width intervals.
amazon %>%
  mutate(
    rating_group = cut(rating, breaks = 5),
    discount_group = cut(discount_percentage_num, breaks = 5)
  ) %>%
  # A missing rating or discount yields an NA bin, which would otherwise be
  # counted and drawn as an extra "NA" row/column. Dropping those rows after
  # binning leaves the interval boundaries unchanged.
  filter(!is.na(rating_group), !is.na(discount_group)) %>%
  count(rating_group, discount_group) %>%
  ggplot(aes(x = rating_group, y = discount_group, fill = n)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(
    title = "Heatmap of Ratings and Discounts",
    x = "Rating Group",
    y = "Discount Group"
  )
The heatmap highlights the most common rating-discount combinations.
# Lollipop chart of the ten products with the largest rating counts.
amazon %>%
  arrange(desc(rating_count_num)) %>%
  # Keep one row per product (the one with the largest count, since the data
  # is already sorted). Without this, repeated product names in the top rows
  # would share one axis position and fewer than ten products would be shown.
  distinct(product_name, .keep_all = TRUE) %>%
  slice_head(n = 10) %>%
  # Build the ordered factor once so the point and the segment end use
  # exactly the same axis position.
  mutate(product = reorder(product_name, rating_count_num)) %>%
  ggplot(aes(x = product, y = rating_count_num)) +
  geom_segment(
    aes(xend = product, y = 0, yend = rating_count_num),
    color = "gray"
  ) +
  geom_point(size = 4, color = "blue") +
  # Shorten long names on the axis only; the underlying data is untouched and
  # names of 50 characters or fewer are displayed unchanged.
  scale_x_discrete(labels = function(x) str_trunc(x, 50)) +
  coord_flip() +
  labs(
    title = "Top Reviewed Products",
    x = "Product",
    y = "Review Count"
  )
These products have the largest number of customer ratings, which is used here as a measure of customer engagement.
# Static scatter plot of rating against list price, coloured by category.
# The `text` aesthetic carries the product name so it appears in the hover
# tooltip once the plot is converted by ggplotly().
p <- amazon |>
  ggplot(
    mapping = aes(
      x = actual_price_num,
      y = rating,
      text = product_name,
      colour = category
    )
  ) +
  geom_point(alpha = 0.7) +
  ggtitle("Price vs Rating") +
  xlab("Price (₹)") +
  ylab("Rating")

# Convert to an interactive plotly widget.
ggplotly(p)
Users can interactively explore products by hovering over points.
This analysis examined pricing, discounts, ratings, and customer reviews in Amazon products. The visualizations revealed differences among categories, customer preferences, and pricing strategies. Interactive and static visualizations together provide a comprehensive view of product performance and customer engagement on Amazon.