Introduction

This project analyzes Amazon product data to understand patterns in pricing, discounts, ratings, and customer reviews. The goal is to identify factors associated with customer satisfaction and product popularity using data visualization techniques in R.

Load Libraries

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(viridis)
## Warning: package 'viridis' was built under R version 4.5.3
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal

Import Dataset

# Read the raw product listing (expected in the working directory) into a
# base data frame; all cleaning happens in the data-preparation step.
amazon <- read.csv(file = "amazon.csv")

Data Preparation

# Derive numeric columns from the text columns read from the CSV. The raw
# price, discount and rating-count columns are kept as-is next to their
# `*_num` counterparts; `rating` is overwritten with its numeric form.
amazon <- amazon %>%
  mutate(
    # Prices: strip the rupee sign and thousands separators before parsing.
    actual_price_num = as.numeric(gsub("[₹,]", "", actual_price)),
    discounted_price_num = as.numeric(gsub("[₹,]", "", discounted_price)),

    # Discounts: strip the percent sign before parsing.
    discount_percentage_num =
      as.numeric(gsub("%", "", discount_percentage, fixed = TRUE)),

    # Ratings: at least one entry is not a number, which makes a bare
    # as.numeric() emit an "NAs introduced by coercion" warning. Blank out
    # anything that is not a plain decimal first, so the NA is deliberate
    # and the conversion runs without warnings.
    rating = as.numeric(
      ifelse(
        grepl("^[0-9]*\\.?[0-9]+$", trimws(rating)),
        trimws(rating),
        NA_character_
      )
    ),

    # Rating counts: strip thousands separators before parsing.
    rating_count_num = as.numeric(gsub(",", "", rating_count))
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `rating = as.numeric(rating)`.
## Caused by warning:
## ! NAs introduced by coercion

Visualization 1: Top Categories by Average Rating

# Horizontal bar chart of the categories with the highest mean rating.
# Missing ratings are ignored when averaging.
# NOTE(review): slice_max() keeps ties by default, so more than 10
# categories may be returned — confirm that is intended.
amazon %>%
  group_by(category) %>%
  summarise(avg_rating = mean(rating, na.rm = TRUE)) %>%
  slice_max(order_by = avg_rating, n = 10) %>%
  ggplot(
    mapping = aes(
      x = reorder(category, avg_rating),  # order bars by their value
      y = avg_rating
    )
  ) +
  geom_col(fill = "steelblue") +
  coord_flip() +  # categories on the vertical axis
  labs(
    title = "Top Categories by Average Rating",
    x = "Category",
    y = "Average Rating"
  )

Insight

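Categories at the top of the chart have the highest average customer ratings. Because the ranking uses a simple mean, a category with only a few products can rank highly on the strength of a handful of ratings.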

Visualization 2: Distribution of Ratings

# Histogram of product ratings in bins 0.1 wide.
amazon %>%
  ggplot(mapping = aes(x = rating)) +
  geom_histogram(binwidth = 0.1, fill = "orange", color = "white") +
  labs(
    title = "Distribution of Product Ratings",
    x = "Rating",
    y = "Count"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Insight

Most products receive ratings above 4 stars.

Visualization 3: Discount Percentage by Category

# Box plot of discount percentage per category, with categories ordered by
# their median discount.
ggplot(
  data = amazon,
  mapping = aes(
    x = reorder(category, discount_percentage_num, FUN = median),
    y = discount_percentage_num
  )
) +
  geom_boxplot(fill = "purple") +
  coord_flip() +  # categories on the vertical axis
  labs(
    title = "Discount Percentage by Category",
    x = "Category",
    y = "Discount (%)"
  )

Insight

Some categories offer significantly larger discounts than others.

Visualization 4: Rating vs Discount

# Scatter plot of rating against discount percentage with a fitted
# linear-model trend line overlaid in red.
amazon %>%
  ggplot(mapping = aes(x = discount_percentage_num, y = rating)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Relationship Between Rating and Discount",
    x = "Discount (%)",
    y = "Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Insight

The fitted linear trend line shows whether larger discounts are associated with higher or lower ratings; on its own it does not establish that discounts influence ratings.

Visualization 5: Average Price by Category

# Horizontal bar chart of the categories with the highest mean list price.
# Missing prices are ignored when averaging.
amazon %>%
  group_by(category) %>%
  summarise(avg_price = mean(actual_price_num, na.rm = TRUE)) %>%
  slice_max(order_by = avg_price, n = 10) %>%
  ggplot(mapping = aes(x = reorder(category, avg_price), y = avg_price)) +
  geom_col(fill = "tomato") +
  coord_flip() +  # categories on the vertical axis
  scale_y_continuous(labels = scales::comma) +  # thousands separators
  labs(
    title = "Average Price by Category",
    x = "Category",
    y = "Average Price (₹)"
  )

Insight

Certain categories contain substantially higher-priced products.

Visualization 6: Heatmap of Ratings and Discounts

# Heatmap of how many products fall into each combination of rating band
# and discount band. Both variables are split into 5 equal-width intervals.
amazon %>%
  # The rating column contains a missing value (a non-numeric entry in the
  # raw data). cut() would propagate it and count() would keep it, adding a
  # spurious "NA" band to the heatmap, so drop incomplete rows first.
  drop_na(rating, discount_percentage_num) %>%
  mutate(
    rating_group = cut(rating, breaks = 5),
    discount_group = cut(discount_percentage_num, breaks = 5)
  ) %>%
  # One row per observed (rating band, discount band) pair; the tally is
  # stored in column `n`.
  count(rating_group, discount_group) %>%
  ggplot(aes(x = rating_group, y = discount_group, fill = n)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(
    title = "Heatmap of Ratings and Discounts",
    x = "Rating Group",
    y = "Discount Group"
  )

Insight

The heatmap highlights the most common rating-discount combinations.

Visualization 7: Top Reviewed Products

# Lollipop chart of the 10 rows with the largest rating counts: a gray stem
# from zero up to the count, topped by a blue dot.
amazon %>%
  arrange(desc(rating_count_num)) %>%
  slice_head(n = 10) %>%
  ggplot(
    mapping = aes(
      x = reorder(product_name, rating_count_num),  # order by count
      y = rating_count_num
    )
  ) +
  geom_segment(
    mapping = aes(xend = product_name, y = 0, yend = rating_count_num),
    color = "gray"
  ) +
  geom_point(size = 4, color = "blue") +
  coord_flip() +  # product names on the vertical axis
  labs(
    title = "Top Reviewed Products",
    x = "Product",
    y = "Review Count"
  )

Insight

These products have the largest number of customer ratings, which serves here as a proxy for customer engagement.

Visualization 8: Interactive Plotly Scatter Plot

# Static scatter plot of rating against list price, colored by category.
# The `text` aesthetic is not drawn by ggplot2; it is supplied so the
# product name is available to the interactive version below.
p <- amazon %>%
  ggplot(
    mapping = aes(
      x = actual_price_num,
      y = rating,
      text = product_name,
      color = category
    )
  ) +
  geom_point(alpha = 0.7) +
  labs(title = "Price vs Rating", x = "Price (₹)", y = "Rating")

# Convert the static plot into an interactive plotly widget.
ggplotly(p)

Insight

Users can interactively explore products by hovering over points.

Conclusion

This analysis examined pricing, discounts, ratings, and customer reviews for Amazon products. The visualizations revealed differences among categories, customer preferences, and pricing strategies. Together, the interactive and static visualizations provide a comprehensive view of product performance and customer engagement on Amazon.