Introduction

The Supermarket Sales dataset contains detailed information about customer purchases, including product details, pricing, customer demographics, and sales metrics such as quantity, total amount, rating, and gross income. Using this dataset, I performed several data analysis tasks to better understand sales patterns and customer behavior.

First, I cleaned the data by converting important columns into numeric format and removing any missing values. After preparing the dataset, I explored different analytical techniques such as grouping, summarising, visualisation, regression analysis, K-means clustering, and KNN classification. These methods helped identify the most popular product lines, average ratings, relationships between price and total sales, and customer clustering patterns. Each visualisation and model was created to uncover meaningful insights that support business decision-making.

Libraries used

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyr)
library(lubridate)
library(readr)
library(dplyr)
library(ggplot2)
library(class)

## Warning: package 'class' was built under R version 4.5.2

DATASET

library(readr)
# Read the sales CSV into a tibble named Supermarket_Sales.
# NOTE(review): the path below is absolute and machine-specific, so the
# script only runs where this exact file location exists.
Supermarket_Sales <- read_csv("D:/SEM5/CAP 484/Supermarket_Sales.csv")

## Rows: 1000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Invoice ID, Branch, City, Customer type, Gender, Product line, Dat...
## dbl  (8): Unit price, Quantity, Tax 5%, Total, cogs, gross margin percentage...
## time (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the loaded table in the interactive data viewer
View(Supermarket_Sales)

View structure

# Print a compact structural summary: one line per column with its type and
# first few values. View() only opens the interactive viewer and prints
# nothing in a rendered report, so it cannot show the structure here.
glimpse(Supermarket_Sales)

TO CHECK THE COLUMNS NAMES

# List the name of every column in the loaded table
names(Supermarket_Sales)

##  [1] "Invoice ID"              "Branch"                 
##  [3] "City"                    "Customer type"          
##  [5] "Gender"                  "Product line"           
##  [7] "Unit price"              "Quantity"               
##  [9] "Tax 5%"                  "Total"                  
## [11] "Date"                    "Time"                   
## [13] "Payment"                 "cogs"                   
## [15] "gross margin percentage" "gross income"           
## [17] "Rating"

Data Cleaning

# Count missing values per column BEFORE any rows are dropped, so the report
# describes the raw data. Running this check after drop_na() can only ever
# report zeros.
missing_summary <- colSums(is.na(Supermarket_Sales))
print(missing_summary)

# Remove rows that contain a missing value in any column (if any)
Supermarket_Sales <- Supermarket_Sales %>% drop_na()

##              Invoice ID                  Branch                    City 
##                       0                       0                       0 
##           Customer type                  Gender            Product line 
##                       0                       0                       0 
##              Unit price                Quantity                  Tax 5% 
##                       0                       0                       0 
##                   Total                    Date                    Time 
##                       0                       0                       0 
##                 Payment                    cogs gross margin percentage 
##                       0                       0                       0 
##            gross income                  Rating 
##                       0                       0

# Average Rating And Summary 

# Per-branch overview: mean rating, summed Total, summed gross income and the
# number of rows, ordered from the highest to the lowest summed Total
summary_branch <- Supermarket_Sales |>
  group_by(Branch) |>
  summarise(
    Avg_Rating = mean(Rating, na.rm = TRUE),
    Total_Sales = sum(Total, na.rm = TRUE),
    Total_Income = sum(`gross income`, na.rm = TRUE),
    Transactions = n()
  ) |>
  arrange(desc(Total_Sales))

print(summary_branch)

## # A tibble: 3 × 5
##   Branch Avg_Rating Total_Sales Total_Income Transactions
##   <chr>       <dbl>       <dbl>        <dbl>        <int>
## 1 C            7.07     110569.        5265.          328
## 2 A            7.03     106200.        5057.          340
## 3 B            6.82     106198.        5057.          332

This summary reports, for each branch, the average rating, total sales, total gross income, and number of transactions, sorted by total sales in descending order.

# Sales In Desc order

# Reorder the rows so the largest Total comes first
Supermarket_Sales <- Supermarket_Sales[
  order(Supermarket_Sales[["Total"]], decreasing = TRUE),
]

This command sorts the dataset by the Total column in descending order, so the highest sales amounts appear first.

# MAKE COLUMN NAMES SYNTACTICALLY VALID

# Rewrite the column names into syntactically valid R names
# (e.g. "Unit price" becomes "Unit.price")
colnames(Supermarket_Sales) <- make.names(colnames(Supermarket_Sales))

#FILTER

# Keep only the rows whose Gender value is "Female"
filter(Supermarket_Sales, Gender == "Female")

## # A tibble: 501 × 17
##    Invoice.ID Branch City  Customer.type Gender Product.line Unit.price Quantity
##    <chr>      <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
##  1 860-79-08… C      Nayp… Member        Female Fashion acc…       99.3       10
##  2 283-26-52… C      Nayp… Member        Female Food and be…       98.5       10
##  3 303-96-22… B      Mand… Normal        Female Home and li…       97.4       10
##  4 744-16-78… B      Mand… Normal        Female Home and li…       97.4       10
##  5 271-88-87… C      Nayp… Member        Female Fashion acc…       97.2       10
##  6 554-42-24… C      Nayp… Normal        Female Sports and …       95.4       10
##  7 325-77-61… A      Yang… Member        Female Home and li…       90.6       10
##  8 731-81-94… C      Nayp… Member        Female Sports and …       89.8       10
##  9 817-69-82… B      Mand… Normal        Female Electronic …       99.7        9
## 10 277-35-58… C      Nayp… Member        Female Food and be…       99.0        9
## # ℹ 491 more rows
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>

This filter returns only the rows where the customer’s gender is Female, allowing us to analyze sales made specifically by female customers

#SUMMARISE
# Overall mean, maximum and minimum of the Total column
Supermarket_Sales |>
  summarise(
    Avg_Sales = mean(Total),
    Max_Sales = max(Total),
    Min_Sales = min(Total)
  )

## # A tibble: 1 × 3
##   Avg_Sales Max_Sales Min_Sales
##       <dbl>     <dbl>     <dbl>
## 1      323.     1043.      10.7

#GROUP BY

# Summed Total and mean Rating for each branch
Supermarket_Sales |>
  group_by(Branch) |>
  summarise(
    Total_Revenue = sum(Total),
    Avg_Rating = mean(Rating)
  )

## # A tibble: 3 × 3
##   Branch Total_Revenue Avg_Rating
##   <chr>          <dbl>      <dbl>
## 1 A            106200.       7.03
## 2 B            106198.       6.82
## 3 C            110569.       7.07

#SLICE

# Rows 1 through 5 of the table in its current order
slice(Supermarket_Sales, 1:5)

## # A tibble: 5 × 17
##   Invoice.ID  Branch City  Customer.type Gender Product.line Unit.price Quantity
##   <chr>       <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
## 1 860-79-0874 C      Nayp… Member        Female Fashion acc…       99.3       10
## 2 687-47-8271 A      Yang… Normal        Male   Fashion acc…       99.0       10
## 3 283-26-5248 C      Nayp… Member        Female Food and be…       98.5       10
## 4 751-41-9720 C      Nayp… Normal        Male   Home and li…       97.5       10
## 5 303-96-2227 B      Mand… Normal        Female Home and li…       97.4       10
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>

#UNITE

# Merge City and Branch into one "City-Branch" column; the result is only
# printed, the stored table is not modified
unite(Supermarket_Sales, col = "City_Branch", City, Branch, sep = "-")

## # A tibble: 1,000 × 16
##    Invoice.ID  City_Branch Customer.type Gender Product.line Unit.price Quantity
##    <chr>       <chr>       <chr>         <chr>  <chr>             <dbl>    <dbl>
##  1 860-79-0874 Naypyitaw-C Member        Female Fashion acc…       99.3       10
##  2 687-47-8271 Yangon-A    Normal        Male   Fashion acc…       99.0       10
##  3 283-26-5248 Naypyitaw-C Member        Female Food and be…       98.5       10
##  4 751-41-9720 Naypyitaw-C Normal        Male   Home and li…       97.5       10
##  5 303-96-2227 Mandalay-B  Normal        Female Home and li…       97.4       10
##  6 744-16-7898 Mandalay-B  Normal        Female Home and li…       97.4       10
##  7 271-88-8734 Naypyitaw-C Member        Female Fashion acc…       97.2       10
##  8 234-65-2137 Naypyitaw-C Normal        Male   Home and li…       95.6       10
##  9 554-42-2417 Naypyitaw-C Normal        Female Sports and …       95.4       10
## 10 325-77-6186 Yangon-A    Member        Female Home and li…       90.6       10
## # ℹ 990 more rows
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>









``` r
#SPLIT
# Break the table into a named list holding one tibble per Branch value
split(Supermarket_Sales, f = Supermarket_Sales[["Branch"]])

## $A
## # A tibble: 340 × 17
##    Invoice.ID Branch City  Customer.type Gender Product.line Unit.price Quantity
##    <chr>      <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
##  1 687-47-82… A      Yang… Normal        Male   Fashion acc…       99.0       10
##  2 325-77-61… A      Yang… Member        Female Home and li…       90.6       10
##  3 384-59-66… A      Yang… Member        Female Food and be…       98.7        9
##  4 704-48-39… A      Yang… Member        Male   Electronic …       88.7       10
##  5 827-77-76… A      Yang… Normal        Male   Sports and …       98.1        9
##  6 139-32-41… A      Yang… Member        Female Sports and …       97.5        9
##  7 805-86-02… A      Yang… Normal        Male   Home and li…       94.0        9
##  8 698-98-59… A      Yang… Normal        Female Food and be…       81.2       10
##  9 138-17-51… A      Yang… Member        Female Home and li…       89.2        9
## 10 638-60-71… A      Yang… Normal        Female Electronic …       99.6        8
## # ℹ 330 more rows
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>
## 
## $B
## # A tibble: 332 × 17
##    Invoice.ID Branch City  Customer.type Gender Product.line Unit.price Quantity
##    <chr>      <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
##  1 303-96-22… B      Mand… Normal        Female Home and li…       97.4       10
##  2 744-16-78… B      Mand… Normal        Female Home and li…       97.4       10
##  3 219-22-93… B      Mand… Member        Male   Sports and …      100.         9
##  4 817-69-82… B      Mand… Normal        Female Electronic …       99.7        9
##  5 766-85-70… B      Mand… Normal        Male   Health and …       87.9       10
##  6 743-04-11… B      Mand… Member        Male   Health and …       97.2        9
##  7 746-04-10… B      Mand… Member        Female Food and be…       84.6       10
##  8 852-62-71… B      Mand… Normal        Female Fashion acc…       83.2       10
##  9 628-90-86… B      Mand… Member        Male   Health and …       82.6       10
## 10 549-84-74… B      Mand… Normal        Female Sports and …       90.3        9
## # ℹ 322 more rows
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>
## 
## $C
## # A tibble: 328 × 17
##    Invoice.ID Branch City  Customer.type Gender Product.line Unit.price Quantity
##    <chr>      <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
##  1 860-79-08… C      Nayp… Member        Female Fashion acc…       99.3       10
##  2 283-26-52… C      Nayp… Member        Female Food and be…       98.5       10
##  3 751-41-97… C      Nayp… Normal        Male   Home and li…       97.5       10
##  4 271-88-87… C      Nayp… Member        Female Fashion acc…       97.2       10
##  5 234-65-21… C      Nayp… Normal        Male   Home and li…       95.6       10
##  6 554-42-24… C      Nayp… Normal        Female Sports and …       95.4       10
##  7 280-17-43… C      Nayp… Member        Male   Health and …       90.5       10
##  8 702-83-52… C      Nayp… Member        Male   Fashion acc…       99.8        9
##  9 731-81-94… C      Nayp… Member        Female Sports and …       89.8       10
## 10 393-65-27… C      Nayp… Normal        Male   Food and be…       89.5       10
## # ℹ 318 more rows
## # ℹ 9 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>

PLOTS

# Which branch has the highest total sales?
# Sum Total within each branch, then draw one bar per branch.
branch_sales <- Supermarket_Sales |>
  group_by(Branch) |>
  summarise(Total_Sales = sum(Total))

ggplot(branch_sales, aes(x = Branch, y = Total_Sales, fill = Branch)) +
  geom_col() +
  ggtitle("Total Sales by Branch") +
  theme_minimal()

The bar chart compares total sales across the three branches. Branch C records the highest total (about 110,569), while branches A and B are almost identical (about 106,200 each), so the gap between branches is small rather than pronounced. Differences of this size may reflect customer footfall, purchasing behavior, or branch performance.

SALES BY GENDER

# Gender-wise sales comparison
# Sum Total within each Gender value, then draw one bar per gender.
gender_sales <- Supermarket_Sales |>
  group_by(Gender) |>
  summarise(Total_Sales = sum(Total))

ggplot(gender_sales, aes(x = Gender, y = Total_Sales, fill = Gender)) +
  geom_col() +
  ggtitle("Sales by Gender") +
  theme_minimal()

SALES DISTRIBUTION BY PRODUCT

# Spread of the per-row Total within each product line, one box per line;
# x-axis labels are tilted 45 degrees
ggplot(
  Supermarket_Sales,
  aes(x = Product.line, y = Total, fill = Product.line)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Sales Distribution by ProductLine")

The boxplot shows how total sales vary across different product lines. From the distribution, some product lines have higher median sales and wider spread, indicating greater variability in purchase amounts. Other product categories display lower and more consistent sales values. This comparison highlights which product lines generate higher revenue per transaction and which ones have more stable but lower sales performance.

#Knn
#NORMALIZATION

# Normalization function
# Min-max rescaling of a numeric vector onto [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length as x. When every value of x is
# identical the range is zero and the plain formula would compute 0 / 0 (NaN
# for the whole vector), so a vector of zeros is returned instead.
normalize <- function(x) {
  value_range <- max(x) - min(x)
  # isTRUE() keeps an NA range (x containing NA) on the ordinary path
  if (isTRUE(value_range == 0)) {
    return(rep(0, length(x)))
  }
  (x - min(x)) / value_range
}
# Flag the numeric columns. vapply() always returns a logical vector, whereas
# sapply() can change its return type depending on the input.
numeric_cols <- vapply(Supermarket_Sales, is.numeric, logical(1))

# Rescale every numeric column with normalize() and collect the results in a
# plain data frame
Supermarket_Sales_norm <- as.data.frame(
  lapply(Supermarket_Sales[, numeric_cols], normalize)
)
head(Supermarket_Sales_norm)

##   Unit.price Quantity    Tax.5.     Total      cogs gross.margin.percentage
## 1  0.9926569        1 1.0000000 1.0000000 1.0000000                     NaN
## 2  0.9890966        1 0.9967441 0.9967441 0.9967441                     NaN
## 3  0.9839786        1 0.9920637 0.9920637 0.9920637                     NaN
## 4  0.9726302        1 0.9816855 0.9816855 0.9816855                     NaN
## 5  0.9712951        1 0.9804646 0.9804646 0.9804646                     NaN
## 6  0.9711838        1 0.9803628 0.9803628 0.9803628                     NaN
##   gross.income     Rating
## 1    1.0000000 0.43333333
## 2    0.9967441 0.78333333
## 3    0.9920637 0.08333333
## 4    0.9816855 0.66666667
## 5    0.9804646 0.06666667
## 6    0.9803628 0.15000000

# NUMERIC FEATURES

# Predictor columns handed to the classifier
data_numeric <- Supermarket_Sales[c("Quantity", "Unit.price", "Rating")]

# TARGET VARIABLE INTO CLASSIFICATION

# Class labels to predict: the Branch value of each row
labels <- Supermarket_Sales[["Branch"]]

# TRAIN AND TEST DATA 

# Fix the RNG so the split is reproducible
set.seed(123)

# Draw 70% of the row positions for training. seq_len() stays correct for an
# empty table (1:nrow() would give c(1, 0)), and floor() makes the integer
# sample size explicit instead of relying on silent truncation.
n_rows <- nrow(data_numeric)
index <- sample(seq_len(n_rows), floor(0.7 * n_rows))

# Sampled rows form the training set; the remaining rows form the test set
train_data  <- data_numeric[index, ]
test_data   <- data_numeric[-index, ]
train_label <- labels[index]
test_label  <- labels[-index]

# BUILD Knn

library(class)

# Rescale the predictors before measuring distances. knn() ranks neighbours
# by Euclidean distance, so on the raw columns Unit.price (values close to
# 100) would dominate Quantity (at most 10 in this data).
# The minimum and range come from the training rows only and are then applied
# to the test rows, so no test-set information enters the scaling.
train_min <- vapply(train_data, min, numeric(1))
train_span <- vapply(train_data, max, numeric(1)) - train_min
train_span[train_span == 0] <- 1  # constant column: avoid dividing by zero

train_scaled <- scale(train_data, center = train_min, scale = train_span)
test_scaled <- scale(test_data, center = train_min, scale = train_span)

# Predict the branch of every test row from its 5 nearest training rows
knn_pred <- knn(
  train = train_scaled,
  test = test_scaled,
  cl = train_label,
  k = 5
)
view(knn_pred)

This code builds a KNN model using k = 5 to predict branch labels for the test data based on the training data.

Confusion Matrix

# Cross-tabulate predicted branch (rows) against actual branch (columns)
confusion_matrix <- table(Predicted = knn_pred, Actual = test_label)
print(confusion_matrix)

##          Actual
## Predicted  A  B  C
##         A 27 34 40
##         B 44 26 31
##         C 30 33 35

# ACCURACY

# Share of test rows whose predicted branch equals the actual branch
print(mean(knn_pred == test_label))

## [1] 0.2933333

# Attach the predicted labels to a copy of the test features
test_plot <- test_data
test_plot[["Predicted_Branch"]] <- knn_pred

# Scatter of Unit.price against Quantity, one colour per predicted branch
ggplot(
  test_plot,
  aes(x = Unit.price, y = Quantity, color = Predicted_Branch)
) +
  geom_point(size = 3, alpha = 0.7) +
  ggtitle("KNN Predictions: Branch") +
  xlab("Unit Price") +
  ylab("Quantity") +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()

# SELECT NUMERIC COLUMN
# The four columns used as clustering features
numeric_data <- select(Supermarket_Sales, Unit.price, Quantity, Total, Rating)

# SCALE DATA

# scale() with its defaults returns a matrix of the rescaled columns
scaled_data <- scale(numeric_data)

This code standardizes the four selected numeric features (Unit price, Quantity, Total, and Rating) so they are on the same scale before applying K-Means.

# K-means

# Fix the RNG so the random starting centres are reproducible
set.seed(123)

# Three clusters on the scaled features, using 20 random starts
kmeans_model <- kmeans(x = scaled_data, centers = 3, nstart = 20)
print(kmeans_model)

## K-means clustering with 3 clusters of sizes 318, 384, 298
## 
## Cluster means:
##    Unit.price   Quantity      Total       Rating
## 1 -0.93825443  0.5204319 -0.3829831  0.077915343
## 2  0.07649566 -1.0358478 -0.6905937 -0.009874914
## 3  0.90265293  0.7794235  1.2985792 -0.070419839
## 
## Clustering vector:
##    [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##   [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##   [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [186] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [223] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 3 3 3 3 3 3 1
##  [260] 1 1 3 3 3 3 3 3 3 3 1 1 3 3 3 1 3 3 3 1 3 3 1 1 3 1 3 3 3 1 1 1 1 3 3 1 1
##  [297] 1 1 3 3 1 3 3 3 1 3 1 3 1 3 3 1 1 3 3 1 3 3 3 3 1 1 1 3 1 3 3 3 2 1 1 3 3
##  [334] 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2
##  [371] 1 1 2 1 2 2 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 2 1 1
##  [408] 2 1 2 1 1 2 2 2 2 1 2 1 2 2 2 1 2 2 1 2 2 2 1 1 1 1 1 2 1 2 2 1 2 1 2 2 1
##  [445] 1 1 1 2 1 2 1 2 2 1 2 1 2 1 2 1 1 2 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 2 1
##  [482] 2 2 1 1 2 1 2 2 2 1 2 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 2 2 1 1 1 2 1 2 2 2
##  [519] 1 2 1 2 2 1 2 1 1 1 1 1 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1
##  [556] 2 2 1 2 1 1 1 1 1 2 2 2 2 2 2 1 2 2 1 2 1 2 2 1 1 2 1 2 1 1 1 1 2 1 2 2 2
##  [593] 1 2 2 2 2 2 2 2 1 2 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 1
##  [630] 1 2 2 2 2 1 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 1 2 1 2 2 1 2 1 1 2 1 1 2 1 2
##  [667] 2 2 2 2 1 2 1 1 1 2 2 1 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 2 1 1 2 1 1 2 1 1 2
##  [704] 2 2 2 2 1 1 2 1 2 2 1 2 1 1 2 1 1 2 1 1 2 1 2 1 2 2 1 2 1 2 1 2 2 1 1 1 2
##  [741] 1 2 2 1 1 2 2 1 2 1 1 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 1 1 2 2
##  [778] 2 1 2 2 1 2 2 2 1 2 2 1 2 1 2 2 1 1 1 2 1 2 1 2 2 2 2 1 2 2 2 2 1 2 1 1 1
##  [815] 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 2 1 2 2 1 2 2 2 1 1 2 2 2 2 2
##  [852] 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 2 2 2 1 1
##  [889] 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2
##  [926] 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
##  [963] 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1000] 2
## 
## Within cluster sum of squares by cluster:
## [1] 597.3201 814.8227 643.9272
##  (between_SS / total_SS =  48.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

This code runs K-means clustering to group the data into 3 clusters and prints the clustering results.

# ADD CLUSTER

# Store each row's cluster assignment as a new Cluster column
Supermarket_Sales[["Cluster"]] <- kmeans_model[["cluster"]]
head(Supermarket_Sales)

## # A tibble: 6 × 18
##   Invoice.ID  Branch City  Customer.type Gender Product.line Unit.price Quantity
##   <chr>       <chr>  <chr> <chr>         <chr>  <chr>             <dbl>    <dbl>
## 1 860-79-0874 C      Nayp… Member        Female Fashion acc…       99.3       10
## 2 687-47-8271 A      Yang… Normal        Male   Fashion acc…       99.0       10
## 3 283-26-5248 C      Nayp… Member        Female Food and be…       98.5       10
## 4 751-41-9720 C      Nayp… Normal        Male   Home and li…       97.5       10
## 5 303-96-2227 B      Mand… Normal        Female Home and li…       97.4       10
## 6 744-16-7898 B      Mand… Normal        Female Home and li…       97.4       10
## # ℹ 10 more variables: Tax.5. <dbl>, Total <dbl>, Date <chr>, Time <time>,
## #   Payment <chr>, cogs <dbl>, gross.margin.percentage <dbl>,
## #   gross.income <dbl>, Rating <dbl>, Cluster <int>

This code adds the cluster number assigned by the K-means model to the Supermarket_Sales dataset as a new column.

# PLOT

# Scatter of scaled Unit.price against scaled Total; each point takes the
# colour of its cluster number
plot(
  x = scaled_data[, "Unit.price"],
  y = scaled_data[, "Total"],
  main = "K-Means Clustering",
  xlab = "Unit.Price (scaled)",
  ylab = "Total(scaled)",
  col = kmeans_model$cluster,
  pch = 19
)

This plot visualizes how K-means has grouped the data into clusters based on scaled Unit Price and Total values.

# REGRESSION 
## SIMPLE LINEAR 
# One-predictor linear model: cogs explained by Unit.price
model_simple <- lm(formula = cogs ~ Unit.price, data = Supermarket_Sales)

# MULTIPLE LINEAR 

# Two-predictor linear model: cogs explained by Unit.price and Quantity.
# It is stored under its own name so the one-predictor fit in model_simple is
# neither duplicated nor overwritten.
model_multiple <- lm(cogs ~ Unit.price + Quantity, data = Supermarket_Sales)

# PREDICTED VALUE

# Fitted cogs for every row of the data, taken from model_simple
predicted_values <- predict(object = model_simple, newdata = Supermarket_Sales)
head(predicted_values)

##        1        2        3        4        5        6 
## 552.0502 550.2571 547.6796 541.9641 541.2917 541.2357

PLOTS

SALES BY BRANCH

# Which branch has the highest total sales?
# Sum Total within each branch, then draw one bar per branch.
branch_sales <- Supermarket_Sales |>
  group_by(Branch) |>
  summarise(Total_Sales = sum(Total))

ggplot(branch_sales, aes(x = Branch, y = Total_Sales, fill = Branch)) +
  geom_col() +
  ggtitle("Total Sales by Branch") +
  theme_minimal()

SALES BY CITY

# Which city contributes the most to total sales?
# Sum Total within each city, then draw one bar per city.
city_sales <- Supermarket_Sales |>
  group_by(City) |>
  summarise(Total_Sales = sum(Total))

ggplot(city_sales, aes(x = City, y = Total_Sales, fill = City)) +
  geom_col() +
  ggtitle("Sales by City") +
  theme_minimal()

Descriptive analysis on supermarket sales

Student 1: Anuj singh

Student 2: Suhani yadav

2025-11-14

Introduction

Libraries used

DATASET

View structure

TO CHECK THE COLUMNS NAMES

Data Cleaning

PLOTS

SALES BY GENDER

SALES DISTRIBUTION BY PRODUCT

Confusion Matrix

PLOTS

SALES BY BRANCH

SALES BY CITY