This project analyzes coffee shop sales data obtained from Kaggle. The objective is to explore sales performance, customer demand patterns, payment preferences, and peak operational periods using exploratory data analysis (EDA). The analysis aims to generate actionable insights to support business decision-making in areas such as staffing, product optimization, and revenue planning.
library("plotly")
library(tidyverse)
library(ggplot2)
library(corrplot)
library(gridExtra)
library(dplyr)
library(stringr)
library(scales)
library(lubridate)
coffee_sales_one <-read.csv("C:/Users/respe/Downloads/archive/index_1.csv")
coffee_sales_two <-read.csv("C:/Users/respe/Downloads/archive/index_2.csv")
The data set used in this analysis consists of a merged CSV file sourced from Kaggle. It contains 3,898 coffee sales transactions recorded between March 2024 and early 2025. The data includes variables such as transaction date and time, payment method (cash or card), card information, coffee name, and revenue (money). Transaction revenue values range from approximately 15 to 40 monetary units, with an average transaction value of about 31.38. Transaction times range from 06:00 to 23:00 hours.
The data set was prepared for analysis by ensuring consistent variable formats and combining the available records.
## date datetime cash_type card
## Length:3636 Length:3636 Length:3636 Length:3636
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## money coffee_name
## Min. :18.12 Length:3636
## 1st Qu.:27.92 Class :character
## Median :32.82 Mode :character
## Mean :31.75
## 3rd Qu.:35.76
## Max. :40.00
## date datetime cash_type money
## Length:262 Length:262 Length:262 Min. :15.0
## Class :character Class :character Class :character 1st Qu.:25.0
## Mode :character Mode :character Mode :character Median :27.0
## Mean :26.3
## 3rd Qu.:29.0
## Max. :33.0
## coffee_name
## Length:262
## Class :character
## Mode :character
##
##
##
## [1] "date" "datetime" "cash_type" "card" "money"
## [6] "coffee_name"
## [1] "date" "datetime" "cash_type" "money" "coffee_name"
## date datetime cash_type card money
## 1 2024-03-01 2024-03-01 10:15:50.520 card ANON-0000-0000-0001 38.7
## 2 2024-03-01 2024-03-01 12:19:22.539 card ANON-0000-0000-0002 38.7
## 3 2024-03-01 2024-03-01 12:20:18.089 card ANON-0000-0000-0002 38.7
## 4 2024-03-01 2024-03-01 13:46:33.006 card ANON-0000-0000-0003 28.9
## 5 2024-03-01 2024-03-01 13:48:14.626 card ANON-0000-0000-0004 38.7
## 6 2024-03-01 2024-03-01 15:39:47.726 card ANON-0000-0000-0005 33.8
## 7 2024-03-01 2024-03-01 16:19:02.756 card ANON-0000-0000-0006 38.7
## 8 2024-03-01 2024-03-01 18:39:03.580 card ANON-0000-0000-0007 33.8
## 9 2024-03-01 2024-03-01 19:22:01.762 card ANON-0000-0000-0008 38.7
## 10 2024-03-01 2024-03-01 19:23:15.887 card ANON-0000-0000-0008 33.8
## coffee_name
## 1 Latte
## 2 Hot Chocolate
## 3 Hot Chocolate
## 4 Americano
## 5 Latte
## 6 Americano with Milk
## 7 Hot Chocolate
## 8 Americano with Milk
## 9 Cocoa
## 10 Americano with Milk
## date datetime cash_type money coffee_name
## 1 2025-02-08 2025-02-08 14:26:04 cash 15 Tea
## 2 2025-02-08 2025-02-08 14:28:26 cash 15 Tea
## 3 2025-02-08 2025-02-08 14:33:04 card 20 Espresso
## 4 2025-02-08 2025-02-08 15:51:04 card 30 Chocolate with coffee
## 5 2025-02-08 2025-02-08 16:35:01 cash 27 Chocolate with milk
## 6 2025-02-08 2025-02-08 18:55:30 card 33 Espresso
## 7 2025-02-08 2025-02-08 18:59:03 cash 29 Coffee with Irish whiskey
## 8 2025-02-08 2025-02-08 19:00:28 cash 27 Irish whiskey with milk
## 9 2025-02-09 2025-02-09 08:45:45 card 28 Double Irish whiskey
## 10 2025-02-09 2025-02-09 10:41:36 cash 30 Chocolate with coffee
## date datetime cash_type card money
## 3627 2025-03-22 2025-03-22 15:57:58.183 card ANON-0000-0000-1302 21.06
## 3628 2025-03-22 2025-03-22 16:26:59.766 card ANON-0000-0000-1274 35.76
## 3629 2025-03-22 2025-03-22 17:53:35.942 card ANON-0000-0000-1274 30.86
## 3630 2025-03-22 2025-03-22 19:16:52.727 card ANON-0000-0000-1314 30.86
## 3631 2025-03-23 2025-03-23 10:07:11.278 card ANON-0000-0000-1196 35.76
## 3632 2025-03-23 2025-03-23 10:34:54.894 card ANON-0000-0000-1158 35.76
## 3633 2025-03-23 2025-03-23 14:43:37.362 card ANON-0000-0000-1315 35.76
## 3634 2025-03-23 2025-03-23 14:44:16.864 card ANON-0000-0000-1315 35.76
## 3635 2025-03-23 2025-03-23 15:47:28.723 card ANON-0000-0000-1316 25.96
## 3636 2025-03-23 2025-03-23 18:11:38.635 card ANON-0000-0000-1275 35.76
## coffee_name
## 3627 Espresso
## 3628 Latte
## 3629 Americano with Milk
## 3630 Americano with Milk
## 3631 Latte
## 3632 Cappuccino
## 3633 Cocoa
## 3634 Cocoa
## 3635 Americano
## 3636 Latte
## date datetime cash_type money coffee_name
## 253 2025-03-22 2025-03-22 22:21:21 card 25 Americano with milk
## 254 2025-03-22 2025-03-22 22:23:39 card 29 Coffee with Irish whiskey
## 255 2025-03-23 2025-03-23 10:16:44 cash 25 Irish whiskey
## 256 2025-03-23 2025-03-23 10:25:48 cash 25 Americano with milk
## 257 2025-03-23 2025-03-23 10:25:48 cash 25 Americano with milk
## 258 2025-03-23 2025-03-23 14:55:46 cash 30 Cappuccino
## 259 2025-03-23 2025-03-23 15:15:36 card 25 Irish whiskey
## 260 2025-03-23 2025-03-23 17:59:25 card 28 Super chocolate
## 261 2025-03-23 2025-03-23 18:01:33 card 28 Vanilla with Irish whiskey
## 262 2025-03-23 2025-03-23 21:23:11 card 29 Coffee with Irish whiskey
## 'data.frame': 3636 obs. of 6 variables:
## $ date : chr "2024-03-01" "2024-03-01" "2024-03-01" "2024-03-01" ...
## $ datetime : chr "2024-03-01 10:15:50.520" "2024-03-01 12:19:22.539" "2024-03-01 12:20:18.089" "2024-03-01 13:46:33.006" ...
## $ cash_type : chr "card" "card" "card" "card" ...
## $ card : chr "ANON-0000-0000-0001" "ANON-0000-0000-0002" "ANON-0000-0000-0002" "ANON-0000-0000-0003" ...
## $ money : num 38.7 38.7 38.7 28.9 38.7 33.8 38.7 33.8 38.7 33.8 ...
## $ coffee_name: chr "Latte" "Hot Chocolate" "Hot Chocolate" "Americano" ...
## 'data.frame': 262 obs. of 5 variables:
## $ date : chr "2025-02-08" "2025-02-08" "2025-02-08" "2025-02-08" ...
## $ datetime : chr "2025-02-08 14:26:04" "2025-02-08 14:28:26" "2025-02-08 14:33:04" "2025-02-08 15:51:04" ...
## $ cash_type : chr "cash" "cash" "card" "card" ...
## $ money : num 15 15 20 30 27 33 29 27 28 30 ...
## $ coffee_name: chr "Tea" "Tea" "Espresso" "Chocolate with coffee" ...
## [1] 0
coffee_sales<- rbind(coffee_sales_one,coffee_sales_two)
nrow(coffee_sales)
## [1] 3898
## date datetime cash_type card money
## 1 2024-03-01 2024-03-01 10:15:50.520 card ANON-0000-0000-0001 38.7
## 2 2024-03-01 2024-03-01 12:19:22.539 card ANON-0000-0000-0002 38.7
## 3 2024-03-01 2024-03-01 12:20:18.089 card ANON-0000-0000-0002 38.7
## 4 2024-03-01 2024-03-01 13:46:33.006 card ANON-0000-0000-0003 28.9
## 5 2024-03-01 2024-03-01 13:48:14.626 card ANON-0000-0000-0004 38.7
## 6 2024-03-01 2024-03-01 15:39:47.726 card ANON-0000-0000-0005 33.8
## coffee_name
## 1 Latte
## 2 Hot Chocolate
## 3 Hot Chocolate
## 4 Americano
## 5 Latte
## 6 Americano with Milk
mean(coffee_sales$money, na.rm = TRUE)
## [1] 31.3806
## date datetime cash_type card money
## 3893 2025-03-23 2025-03-23 10:25:48 cash <NA> 25
## 3894 2025-03-23 2025-03-23 14:55:46 cash <NA> 30
## 3895 2025-03-23 2025-03-23 15:15:36 card <NA> 25
## 3896 2025-03-23 2025-03-23 17:59:25 card <NA> 28
## 3897 2025-03-23 2025-03-23 18:01:33 card <NA> 28
## 3898 2025-03-23 2025-03-23 21:23:11 card <NA> 29
## coffee_name
## 3893 Americano with milk
## 3894 Cappuccino
## 3895 Irish whiskey
## 3896 Super chocolate
## 3897 Vanilla with Irish whiskey
## 3898 Coffee with Irish whiskey
## date datetime cash_type card
## Length:3898 Length:3898 Length:3898 Length:3898
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## money coffee_name
## Min. :15.00 Length:3898
## 1st Qu.:27.92 Class :character
## Median :32.82 Mode :character
## Mean :31.38
## 3rd Qu.:35.76
## Max. :40.00
## [1] Friday Friday Friday Friday Friday Friday
## Levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 'data.frame': 3898 obs. of 10 variables:
## $ date : chr "2024-03-01" "2024-03-01" "2024-03-01" "2024-03-01" ...
## $ datetime : POSIXct, format: "2024-03-01 10:15:50" "2024-03-01 12:19:22" ...
## $ cash_type : chr "card" "card" "card" "card" ...
## $ card : chr "ANON-0000-0000-0001" "ANON-0000-0000-0002" "ANON-0000-0000-0002" "ANON-0000-0000-0003" ...
## $ money : num 38.7 38.7 38.7 28.9 38.7 33.8 38.7 33.8 38.7 33.8 ...
## $ coffee_name: chr "Latte" "Hot Chocolate" "Hot Chocolate" "Americano" ...
## $ hour : num 10 12 12 13 13 15 16 18 19 19 ...
## $ day : num 1 1 1 1 1 1 1 1 1 1 ...
## $ month : POSIXct, format: "2024-03-01" "2024-03-01" ...
## $ weekday : Factor w/ 7 levels "Monday","Tuesday",..: 5 5 5 5 5 5 5 5 5 5 ...
## date datetime cash_type card money coffee_name
## 0 0 0 262 0 0
## hour day month weekday
## 0 0 0 0
## [1] 1 1 1 1 1 1
## POSIXct[1:3898], format: "2024-03-01 10:15:50" "2024-03-01 12:19:22" "2024-03-01 12:20:18" ...
## [1] "2024-03-01 +03" "2024-03-01 +03" "2024-03-01 +03" "2024-03-01 +03"
## [5] "2024-03-01 +03" "2024-03-01 +03"
sum(coffee_sales$money, na.rm = TRUE)
## [1] 122321.6
sum(coffee_sales$money)
## [1] 122321.6
sum(monthly_data$revenue)
## [1] 122321.6
sum(weekly_sales$money)
## [1] 122321.6
sum(daily_sales$money)
## [1] 122321.6
## tibble [13 × 3] (S3: tbl_df/tbl/data.frame)
## $ month : POSIXct[1:13], format: "2024-03-01" "2024-04-01" ...
## $ revenue : num [1:13] 7050 6721 9063 7759 6916 ...
## $ transactions: int [1:13] 206 196 267 227 237 272 344 426 259 259 ...
range(daily_sales$money)
## [1] 2412.90 5096.58
A total of 262 missing values were identified in the card column of coffee_sales_two. The main data set does not contain missing values, these missing values are not expected to significantly affect the analysis as the value is not used for subsequent computation. Additionally, product name was standardized by converting all entries to lowercase in other to fix redundancy.
ggplot(coffee_sales, aes(x=cash_type)) +
geom_bar(fill= "lightblue")+theme_classic()+ labs(title = "Payment Method Distribution of Transactions",
x="Payment Method", y= "Number of Transactions")
Card payments account for the majority of transactions, while cash payments represent a relatively small proportion. This suggests that customers predominantly prefer electronic payment methods, which may reflect convenience and modern payment trends.
ggplot(coffee_sales, aes(x=coffee_name))+ geom_bar(fill= "#2C7FB8") + theme_classic(base_size=12) + coord_flip() +theme(axis.text.x = element_text(size=10))
The distribution of coffee products shows that a small number of items account for a large share of total sales. This indicates that customer demand is concentrated on a few popular products, while the remaining items exhibit relatively lower sales frequency.
ggplot(coffee_sales, aes(x = money)) +
geom_histogram(bins = 20, fill = "purple") +
theme_minimal() +
labs(
title = "Distribution of Transaction Revenue",
x = "Revenue Range",
y = "Number of Transactions"
)
The distribution of transaction revenue indicates that most purchases fall within a relatively narrow range, suggesting consistent pricing across products. The absence of extreme values implies that there are no significant outliers in transaction amounts.
ggplot(monthly_data, aes(x = month, y = transactions )) +
geom_line(color = "#2C7FB8", linewidth = 1.2) +
geom_point(color = "darkorange", size = 2) +
scale_y_continuous(limits = c(100, NA))+
theme_minimal(base_size = 12) +
labs(title = "Monthly Transaction Trend",
x = "Months of the Year",
y = "Number of Transactions") +
theme(plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1))
The monthly transaction trend shows variation in the number of transactions across months. Some months exhibit higher transaction volumes, indicating periods of increased customer activity, while others show relatively lower activity levels.
ggplot(hourly_demand, aes(x = hour, y = demand)) +
geom_line(color = "#1F77B4", linewidth = 0.5) +
geom_point(color= "#1F77B4", size = 1.7)+
geom_text(aes(label = scales::comma(demand)),
vjust = -1.5, angle = 0, color = "#6A5ACD",size = 2)+
theme_classic()+ labs(title = "Hourly Customer Demand",
x= "Hour of the Day", y="Number of Orders")
Customer demand increases during the morning hours and reaches a peak at approximately 10:00. followed by gradual decline with minor fluctuations.
ggplot(hourly_product_demand, aes(x = hour, y = product_demand)) +
geom_line(color= "#1F77B4") + facet_wrap(~ coffee_name)+
theme_classic()+ labs(title = "Hourly Product Demand",
x= "Hour of the Day", y="Number of Orders")
A small number of products (Latte, Americano, Cappuccino, cortado, cocoa, hot chocolate) dominate demand, with Americano with milk recording the highest sales, while most other products display moderate to low or irregular sales patterns.
ggplot(coffee_sales, aes(x = cash_type, fill = cash_type)) +
geom_bar() +
stat_count(aes(label = percent(..count../sum(..count..))),
geom = "text",
vjust = -0.2) +
theme_classic() +
labs(title = "Customers Payment Preferences",
x = "Payment Options",
y = "Sum of Payments Made")
Card payments account for the majority of transactions, while cash payments represent a relatively small proportion.
ggplot(daily_sales, aes(x=day, y= money))+
geom_col(fill= "steelblue")+ theme_classic() +
labs(title= "Sales Distribution by the Day of the Month",
x= "Days of the Month", y="Total Revenue")
Daily revenue fluctuates between approximately 2412.90 and 5096.58. Stronger performance is observed in early and mid-month periods, with a decline toward end of the month.
ggplot(weekly_sales, aes(x=weekday, y= money))+
geom_col (fill = "royalblue" )+ theme_classic() +
labs(title= "Weekly Sales Distribution by the Day of the Week",
x= "Days of the Week", y="Weekly Revenue Generated")
Sales are highest at the beginning of the week, particularly Mondays and Tuesdays, decline midweek particularly on Wednesdays and Thursdays with an increase on Fridays, before dropping during weekends.
ggplot(monthly_data, aes(x = month, y = revenue)) +
geom_col(fill = "#2C7FB8") +
theme_minimal() +
labs(
title = "Revenue Trend by Month",
x = "Month of the year",
y = "Total Revenue"
)
The monthly revenue trend shows fluctuations in total revenue across the observed period. February records noticeably higher revenue, while March and October also show relatively high values, indicating periods of increased customer spending and stronger business performance. In contrast, the remaining months generally exhibit lower and more fluctuating revenue, suggesting an uneven distribution of sales over time.
ggplot(monthly_data, aes(x = transactions, y = revenue)) +
geom_point(color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
theme_minimal() +
labs(
title = "Correlation Between Monthly Transactions and Revenue",
x = "Number of Purchase",
y = "Total Revenue Generated"
)
The scatter plot shows a very strong positive linear relationship between transactions and revenue, indicating that revenue is largely driven by transaction volume, with observations closely clustered around the regression line. Only minor deviations are observed, indicating slight variations in spending per transaction.
The analysis reveals the following key insights: Sales performance is stronger on weekdays compared to weekends. Peak demand occurs at exact 10:00 in the morning, Card payments dominate customer transactions. A small subset of products account for the majority of demand, sales show predictable daily and weekly patterns.
The coffee shop exhibits clear and consistent sales patterns, with peak activity occurring in the morning hours and stronger performance during weekdays. These findings can support decisions related to staffing schedules, inventory management, and product strategy.
R Code for Data Cleaning and Analysis
summary(coffee_sales_one)
summary(coffee_sales_two)
colnames(coffee_sales_one)
colnames(coffee_sales_two)
head(coffee_sales_one, 10)
head(coffee_sales_two, 10)
tail(coffee_sales_one, 10)
tail(coffee_sales_two, 10)
str(coffee_sales_one)
str(coffee_sales_two)
sum(sapply(coffee_sales_one, function(x) sum(is.na(x))))
sum(sapply(coffee_sales_two, function(x) sum(is.na(x))))
coffee_sales_two$card <- NA
coffee_sales <- rbind(coffee_sales_one, coffee_sales_two)
nrow(coffee_sales)
head(coffee_sales)
tail(coffee_sales)
summary(coffee_sales)
coffee_sales$datetime <- as.POSIXct(
coffee_sales$datetime,
format = "%Y-%m-%d %H:%M:%OS"
)
coffee_sales$hour <- as.numeric(format(coffee_sales$datetime, "%H"))
coffee_sales$day <- as.numeric(format(coffee_sales$datetime, "%d"))
coffee_sales$month <- floor_date(coffee_sales$datetime, "month")
coffee_sales$weekday <- factor(
format(coffee_sales$datetime, "%A"),
levels = c("Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday","Sunday")
)
str(coffee_sales)
colSums(is.na(coffee_sales))
head(coffee_sales$weekday)
head(coffee_sales$day)
head(coffee_sales$month)
coffee_sales$coffee_name <- trimws(tolower(coffee_sales$coffee_name))
hourly_demand <- coffee_sales %>%
count(hour, name = "demand")
hourly_product_demand <- coffee_sales %>%
count(hour, coffee_name, name = "product_demand")
daily_sales <- coffee_sales %>%
group_by(day) %>%
summarise(money = sum(money, na.rm = TRUE)) %>%
arrange(day)
weekly_sales <- coffee_sales %>%
group_by(weekday) %>%
summarise(money = sum(money, na.rm = TRUE))
monthly_data <- coffee_sales %>%
mutate(month = floor_date(datetime, "month")) %>%
group_by(month) %>%
summarise(
revenue = sum(money, na.rm = TRUE),
transactions = n()
)
sum(coffee_sales$money, na.rm = TRUE)
sum(monthly_data$revenue)
sum(weekly_sales$money)
sum(daily_sales$money)
mean(coffee_sales$money, na.rm = TRUE)
range(daily_sales$money)
str(monthly_data)