install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("readxl")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
data <- read_excel("realistic_shopping_dataset.xlsx")
data
head(data)
str(data)
## tibble [976 × 23] (S3: tbl_df/tbl/data.frame)
## $ Order_ID : num [1:976] 1 2 3 4 5 6 7 8 9 10 ...
## $ Customer_ID : num [1:976] 1554 1343 1399 1334 1819 ...
## $ Customer_Name : chr [1:976] "Customer_0" "Customer_1" "Customer_2" "Customer_3" ...
## $ Gender : chr [1:976] "Male" "Female" "Female" "Female" ...
## $ Age : num [1:976] 54 40 56 45 62 38 38 62 61 48 ...
## $ City : chr [1:976] "Thiruvananthapuram" "Thiruvananthapuram" "Thiruvananthapuram" "Kollam" ...
## $ Product_ID : num [1:976] 5658 5359 5234 5504 5308 ...
## $ Product_Name : chr [1:976] "Shirt" "Shoes" "Shirt" "Shoes" ...
## $ Category : chr [1:976] "Clothing" "Clothing" "clothing" "Clothing" ...
## $ Brand : chr [1:976] "Nike" "Adidas" "Nike" "Nike" ...
## $ Price : num [1:976] 5510 39669 43897 21010 35117 ...
## $ Quantity : num [1:976] 3 1 4 1 2 2 1 2 3 4 ...
## $ Discount_% : num [1:976] 3 20 1 3 35 39 6 18 19 22 ...
## $ Shopping _mode : chr [1:976] "Offline" "Online" "Online" "Offline" ...
## $ Payment_Method : chr [1:976] "Cash" "Card" "Card" "UPI" ...
## $ Order_Date : POSIXct[1:976], format: "2024-10-16" "2024-07-16" ...
## $ Total_Price : num [1:976] 16530 39669 175588 21010 70234 ...
## $ Discount_Amount: num [1:976] 496 7934 1756 630 24582 ...
## $ Final_Amount : num [1:976] 16034 31735 173832 20380 45652 ...
## $ Delivery_Date : POSIXct[1:976], format: "2024-10-20" "2024-07-20" ...
## $ Delivery_Status: chr [1:976] "Delivered" "Pending" "Pending" "Cancelled" ...
## $ Rating : num [1:976] 1 2 5 4 1 1 3 2 5 4 ...
## $ Review : chr [1:976] "Good" "Average" "Good" "Good" ...
#remove missing values
data <-na.omit(data)
data
summary(data)
## Order_ID Customer_ID Customer_Name Gender
## Min. : 1.0 Min. :1001 Length:976 Length:976
## 1st Qu.:244.8 1st Qu.:1258 Class :character Class :character
## Median :488.5 Median :1515 Mode :character Mode :character
## Mean :488.5 Mean :1509
## 3rd Qu.:732.2 3rd Qu.:1754
## Max. :976.0 Max. :1999
## Age City Product_ID Product_Name
## Min. :18.00 Length:976 Min. :5001 Length:976
## 1st Qu.:29.00 Class :character 1st Qu.:5253 Class :character
## Median :41.00 Mode :character Median :5502 Mode :character
## Mean :41.51 Mean :5495
## 3rd Qu.:54.00 3rd Qu.:5731
## Max. :64.00 Max. :5999
## Category Brand Price Quantity
## Length:976 Length:976 Min. : 116 Min. :1.000
## Class :character Class :character 1st Qu.:12473 1st Qu.:1.000
## Mode :character Mode :character Median :24446 Median :3.000
## Mean :24445 Mean :2.497
## 3rd Qu.:36034 3rd Qu.:3.000
## Max. :49931 Max. :4.000
## Discount_% Shopping _mode Payment_Method
## Min. : 0.00 Length:976 Length:976
## 1st Qu.:10.00 Class :character Class :character
## Median :19.00 Mode :character Mode :character
## Mean :19.18
## 3rd Qu.:29.00
## Max. :39.00
## Order_Date Total_Price Discount_Amount
## Min. :2024-01-01 00:00:00 Min. : 116 Min. : 0
## 1st Qu.:2024-03-25 18:00:00 1st Qu.: 22356 1st Qu.: 2384
## Median :2024-07-01 12:00:00 Median : 46965 Median : 7648
## Mean :2024-07-02 01:50:39 Mean : 61540 Mean :11788
## 3rd Qu.:2024-10-08 12:00:00 3rd Qu.: 91045 3rd Qu.:16588
## Max. :2024-12-30 00:00:00 Max. :199220 Max. :73646
## Final_Amount Delivery_Date Delivery_Status
## Min. : 107.9 Min. :2024-01-05 00:00:00 Length:976
## 1st Qu.: 18052.1 1st Qu.:2024-03-30 00:00:00 Class :character
## Median : 37593.1 Median :2024-07-06 00:00:00 Mode :character
## Mean : 49752.6 Mean :2024-07-07 00:45:44
## 3rd Qu.: 73350.1 3rd Qu.:2024-10-13 06:00:00
## Max. :196424.0 Max. :2025-01-06 00:00:00
## Rating Review
## Min. :1.000 Length:976
## 1st Qu.:2.000 Class :character
## Median :3.000 Mode :character
## Mean :2.967
## 3rd Qu.:4.000
## Max. :5.000
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
data %>% filter(Age >=20)
data %>% filter(Gender == "Male")
mean(data$Final_Amount)
## [1] 49752.59
mean(data$Price)
## [1] 24444.73
mean(data$`Discount_%`)
## [1] 19.18443
median(data$Discount_Amount)
## [1] 7647.625
max(data$Discount_Amount)
## [1] 73646.28
min(data$Total_Price)
## [1] 116
IQR(data$Price)
## [1] 23561.25
cor(data$Final_Amount,data$`Discount_%`)
## [1] -0.1725435
total_revenue <- sum(data$Final_Amount, na.rm = TRUE)
total_orders <- nrow(data)
avg_order_value <- mean(data$Final_Amount, na.rm = TRUE)
total_revenue
## [1] 48558523
total_orders
## [1] 976
avg_order_value
## [1] 49752.59
#Category Analysis
category_analysis <- data %>%
group_by(Category) %>%
summarise(
Total_Sales = sum(Final_Amount, na.rm = TRUE),
Orders = n()
) %>%
arrange(desc(Total_Sales))
category_analysis
#Payment Analysis
payment_analysis <- data %>%
group_by(Payment_Method) %>%
summarise(
Revenue = sum(Final_Amount, na.rm = TRUE)
)
payment_analysis
transaction_data <- data %>%
count(Payment_Method)
ggplot(transaction_data, aes(x = "", y = "", fill = Payment_Method)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y")

#city analysis
city_analysis <- data %>%
group_by(City) %>%
summarise(
Revenue = sum(Final_Amount, na.rm = TRUE)
) %>%
arrange(desc(Revenue))
city_analysis
#monthly sales revenue
monthly_sales <- data %>%
mutate(Month = floor_date(Order_Date, "month")) %>%
group_by(Month) %>%
summarise(
Revenue = sum(Final_Amount, na.rm = TRUE)
)
monthly_sales
#VISUALISATION
#sales by category
ggplot(category_analysis, aes(x = Category, y = Total_Sales)) +
geom_bar(stat = "identity", fill = "blue") +
theme_minimal() +
labs(title = "Sales by Category")

#payment method by revenue
ggplot(payment_analysis, aes(x = Payment_Method, y = Revenue)) +
geom_bar(stat = "identity", fill = "green") +
theme_minimal()

#month by revenue
ggplot(monthly_sales, aes(x = Month, y = Revenue)) +
geom_line(color = "red") +
geom_point() +
theme_minimal() +
labs(title = "Monthly Sales Trend")

#revenue by city
ggplot(city_analysis, aes(x = City, y = Revenue)) +
geom_bar(stat = "identity", fill = "purple") +
theme_minimal()

#female male
gender_data <- data %>%
count(Gender)
ggplot(gender_data, aes(x = "", y = n, fill = Gender)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y")

#total amount and category
category_sales <- data %>%
group_by(Category) %>%
summarise(Total = sum(Final_Amount))
ggplot(category_sales, aes(x = Category, y = Total, fill = Category)) +
geom_bar(stat = "identity") +
theme_minimal()

#rating by brand
ggplot(data, aes(x = Brand, y = Rating)) +
geom_boxplot(fill = "orange") +
coord_flip() +
theme_minimal() +
labs(title = "Rating Distribution by Brand")

#no.of customers by amount
library(ggplot2)
ggplot(data, aes(x = Final_Amount)) +
geom_histogram(binwidth = 5000, fill = "blue") +
labs(title = "Distribution of Final Amount",
x = "Final Amount",
y = "Number of customers") +
theme_minimal()

#brand preference
ggplot(data, aes(x = Brand, fill = Brand)) +
geom_bar() +
labs(title = "Brand Preference")

#brand revenue order
brand_analysis <- data %>%
group_by(Brand) %>%
summarise(
Revenue = sum(Final_Amount, na.rm = TRUE),
Orders = n()
) %>%
arrange(desc(Revenue))
brand_analysis
#brand vs gender
library(ggplot2)
library(dplyr)
brand_gender <- data %>%
count(Brand, Gender)
ggplot(brand_gender, aes(x = Brand, y = n, fill = Gender)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Brand vs Gender",
x = "Brand",
y = "Count") +
theme_minimal()

ggplot(data, aes(x = Brand, y = Discount_Amount, fill = Brand)) +
geom_boxplot() +
labs(title = "Discount Distribution by Brand",
y = "Discount (%)") +
theme_minimal()

ggplot(data, aes(x = Discount_Amount, y = Final_Amount)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Discount vs Spending")
## `geom_smooth()` using formula = 'y ~ x'

#spending by age
ggplot(data, aes(x = Age, y = Final_Amount)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Age vs Spending")
## `geom_smooth()` using formula = 'y ~ x'

library(shiny)