install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("readxl")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
data <- read_excel("realistic_shopping_dataset.xlsx")
data
head(data)
str(data)
## tibble [976 × 23] (S3: tbl_df/tbl/data.frame)
##  $ Order_ID       : num [1:976] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Customer_ID    : num [1:976] 1554 1343 1399 1334 1819 ...
##  $ Customer_Name  : chr [1:976] "Customer_0" "Customer_1" "Customer_2" "Customer_3" ...
##  $ Gender         : chr [1:976] "Male" "Female" "Female" "Female" ...
##  $ Age            : num [1:976] 54 40 56 45 62 38 38 62 61 48 ...
##  $ City           : chr [1:976] "Thiruvananthapuram" "Thiruvananthapuram" "Thiruvananthapuram" "Kollam" ...
##  $ Product_ID     : num [1:976] 5658 5359 5234 5504 5308 ...
##  $ Product_Name   : chr [1:976] "Shirt" "Shoes" "Shirt" "Shoes" ...
##  $ Category       : chr [1:976] "Clothing" "Clothing" "clothing" "Clothing" ...
##  $ Brand          : chr [1:976] "Nike" "Adidas" "Nike" "Nike" ...
##  $ Price          : num [1:976] 5510 39669 43897 21010 35117 ...
##  $ Quantity       : num [1:976] 3 1 4 1 2 2 1 2 3 4 ...
##  $ Discount_%     : num [1:976] 3 20 1 3 35 39 6 18 19 22 ...
##  $ Shopping _mode : chr [1:976] "Offline" "Online" "Online" "Offline" ...
##  $ Payment_Method : chr [1:976] "Cash" "Card" "Card" "UPI" ...
##  $ Order_Date     : POSIXct[1:976], format: "2024-10-16" "2024-07-16" ...
##  $ Total_Price    : num [1:976] 16530 39669 175588 21010 70234 ...
##  $ Discount_Amount: num [1:976] 496 7934 1756 630 24582 ...
##  $ Final_Amount   : num [1:976] 16034 31735 173832 20380 45652 ...
##  $ Delivery_Date  : POSIXct[1:976], format: "2024-10-20" "2024-07-20" ...
##  $ Delivery_Status: chr [1:976] "Delivered" "Pending" "Pending" "Cancelled" ...
##  $ Rating         : num [1:976] 1 2 5 4 1 1 3 2 5 4 ...
##  $ Review         : chr [1:976] "Good" "Average" "Good" "Good" ...
#remove missing values
data <-na.omit(data)
data
summary(data)
##     Order_ID      Customer_ID   Customer_Name         Gender         
##  Min.   :  1.0   Min.   :1001   Length:976         Length:976        
##  1st Qu.:244.8   1st Qu.:1258   Class :character   Class :character  
##  Median :488.5   Median :1515   Mode  :character   Mode  :character  
##  Mean   :488.5   Mean   :1509                                        
##  3rd Qu.:732.2   3rd Qu.:1754                                        
##  Max.   :976.0   Max.   :1999                                        
##       Age            City             Product_ID   Product_Name      
##  Min.   :18.00   Length:976         Min.   :5001   Length:976        
##  1st Qu.:29.00   Class :character   1st Qu.:5253   Class :character  
##  Median :41.00   Mode  :character   Median :5502   Mode  :character  
##  Mean   :41.51                      Mean   :5495                     
##  3rd Qu.:54.00                      3rd Qu.:5731                     
##  Max.   :64.00                      Max.   :5999                     
##    Category            Brand               Price          Quantity    
##  Length:976         Length:976         Min.   :  116   Min.   :1.000  
##  Class :character   Class :character   1st Qu.:12473   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Median :24446   Median :3.000  
##                                        Mean   :24445   Mean   :2.497  
##                                        3rd Qu.:36034   3rd Qu.:3.000  
##                                        Max.   :49931   Max.   :4.000  
##    Discount_%    Shopping _mode     Payment_Method    
##  Min.   : 0.00   Length:976         Length:976        
##  1st Qu.:10.00   Class :character   Class :character  
##  Median :19.00   Mode  :character   Mode  :character  
##  Mean   :19.18                                        
##  3rd Qu.:29.00                                        
##  Max.   :39.00                                        
##    Order_Date                   Total_Price     Discount_Amount
##  Min.   :2024-01-01 00:00:00   Min.   :   116   Min.   :    0  
##  1st Qu.:2024-03-25 18:00:00   1st Qu.: 22356   1st Qu.: 2384  
##  Median :2024-07-01 12:00:00   Median : 46965   Median : 7648  
##  Mean   :2024-07-02 01:50:39   Mean   : 61540   Mean   :11788  
##  3rd Qu.:2024-10-08 12:00:00   3rd Qu.: 91045   3rd Qu.:16588  
##  Max.   :2024-12-30 00:00:00   Max.   :199220   Max.   :73646  
##   Final_Amount      Delivery_Date                 Delivery_Status   
##  Min.   :   107.9   Min.   :2024-01-05 00:00:00   Length:976        
##  1st Qu.: 18052.1   1st Qu.:2024-03-30 00:00:00   Class :character  
##  Median : 37593.1   Median :2024-07-06 00:00:00   Mode  :character  
##  Mean   : 49752.6   Mean   :2024-07-07 00:45:44                     
##  3rd Qu.: 73350.1   3rd Qu.:2024-10-13 06:00:00                     
##  Max.   :196424.0   Max.   :2025-01-06 00:00:00                     
##      Rating         Review         
##  Min.   :1.000   Length:976        
##  1st Qu.:2.000   Class :character  
##  Median :3.000   Mode  :character  
##  Mean   :2.967                     
##  3rd Qu.:4.000                     
##  Max.   :5.000
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
data %>% filter(Age >=20)
data %>% filter(Gender == "Male")
mean(data$Final_Amount)
## [1] 49752.59
mean(data$Price)
## [1] 24444.73
mean(data$`Discount_%`)
## [1] 19.18443
median(data$Discount_Amount)
## [1] 7647.625
max(data$Discount_Amount)
## [1] 73646.28
min(data$Total_Price)
## [1] 116
IQR(data$Price)
## [1] 23561.25
cor(data$Final_Amount,data$`Discount_%`)
## [1] -0.1725435
total_revenue <- sum(data$Final_Amount, na.rm = TRUE)
total_orders <- nrow(data)
avg_order_value <- mean(data$Final_Amount, na.rm = TRUE)

total_revenue
## [1] 48558523
total_orders
## [1] 976
avg_order_value
## [1] 49752.59
#Category Analysis
category_analysis <- data %>%
  group_by(Category) %>%
  summarise(
    Total_Sales = sum(Final_Amount, na.rm = TRUE),
    Orders = n()
  ) %>%
  arrange(desc(Total_Sales))

category_analysis
#Payment Analysis
payment_analysis <- data %>%
  group_by(Payment_Method) %>%
  summarise(
    Revenue = sum(Final_Amount, na.rm = TRUE)
  )

payment_analysis
transaction_data <- data %>%
  count(Payment_Method)

ggplot(transaction_data, aes(x = "", y = "", fill = Payment_Method)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y")

#city analysis
city_analysis <- data %>%
  group_by(City) %>%
  summarise(
    Revenue = sum(Final_Amount, na.rm = TRUE)
  ) %>%
  arrange(desc(Revenue))

city_analysis
#monthly sales revenue
monthly_sales <- data %>%
  mutate(Month = floor_date(Order_Date, "month")) %>%
  group_by(Month) %>%
  summarise(
    Revenue = sum(Final_Amount, na.rm = TRUE)
  )

monthly_sales
#VISUALISATION
#sales by category
ggplot(category_analysis, aes(x = Category, y = Total_Sales)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme_minimal() +
  labs(title = "Sales by Category")

#payment method by revenue
ggplot(payment_analysis, aes(x = Payment_Method, y = Revenue)) +
  geom_bar(stat = "identity", fill = "green") +
  theme_minimal()

#month by revenue
ggplot(monthly_sales, aes(x = Month, y = Revenue)) +
  geom_line(color = "red") +
  geom_point() +
  theme_minimal() +
  labs(title = "Monthly Sales Trend")

#revenue by city
ggplot(city_analysis, aes(x = City, y = Revenue)) +
  geom_bar(stat = "identity", fill = "purple") +
  theme_minimal()

#female male
gender_data <- data %>%
  count(Gender)

ggplot(gender_data, aes(x = "", y = n, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y")

#total amount and category
category_sales <- data %>%
  group_by(Category) %>%
  summarise(Total = sum(Final_Amount))

ggplot(category_sales, aes(x = Category, y = Total, fill = Category)) +
  geom_bar(stat = "identity") +
  theme_minimal()

#rating by brand
ggplot(data, aes(x = Brand, y = Rating)) +
  geom_boxplot(fill = "orange") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Rating Distribution by Brand")

#no.of customers by amount
library(ggplot2)

ggplot(data, aes(x = Final_Amount)) +
  geom_histogram(binwidth = 5000, fill = "blue") +
  labs(title = "Distribution of Final Amount",
       x = "Final Amount",
       y = "Number of customers") +
  theme_minimal()

#brand preference
ggplot(data, aes(x = Brand, fill = Brand)) +
  geom_bar() +
  labs(title = "Brand Preference")

#brand revenue order
brand_analysis <- data %>%
  group_by(Brand) %>%
  summarise(
    Revenue = sum(Final_Amount, na.rm = TRUE),
    Orders = n()
  ) %>%
  arrange(desc(Revenue))

brand_analysis
#brand vs gender
library(ggplot2)
library(dplyr)

brand_gender <- data %>%
  count(Brand, Gender)

ggplot(brand_gender, aes(x = Brand, y = n, fill = Gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Brand vs Gender",
       x = "Brand",
       y = "Count") +
  theme_minimal()

ggplot(data, aes(x = Brand, y = Discount_Amount, fill = Brand)) +
  geom_boxplot() +
  labs(title = "Discount Distribution by Brand",
       y = "Discount (%)") +
  theme_minimal()

ggplot(data, aes(x = Discount_Amount, y = Final_Amount)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Discount vs Spending")
## `geom_smooth()` using formula = 'y ~ x'

#spending by age
ggplot(data, aes(x = Age, y = Final_Amount)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Age vs Spending")
## `geom_smooth()` using formula = 'y ~ x'

library(shiny)