1. Introduction

This project analyzes coffee shop sales data obtained from Kaggle. The objective is to explore sales performance, customer demand patterns, payment preferences, and peak operational periods using exploratory data analysis (EDA). The analysis aims to generate actionable insights to support business decision-making in areas such as staffing, product optimization, and revenue planning.

2. Loading Libraries

library("plotly")
library(tidyverse)
library(ggplot2)
library(corrplot)
library(gridExtra)
library(dplyr)
library(stringr)
library(scales)
library(lubridate)

3. Data Importation

coffee_sales_one <-read.csv("C:/Users/respe/Downloads/archive/index_1.csv") 
coffee_sales_two <-read.csv("C:/Users/respe/Downloads/archive/index_2.csv") 

Datasets Source : Kaggle

Name : Coffee Shop Sales

Author : Yaroslav Isaienkov

Date : March, 2024.

4. Data Description

The data set used in this analysis consists of a merged CSV file sourced from Kaggle. It contains 3,898 coffee sales transactions recorded between March 2024 and early 2025. The data includes variables such as transaction date and time, payment method (cash or card), card information, coffee name, and revenue (money). Transaction revenue values range from approximately 15 to 40 monetary units, with an average transaction value of about 31.38. Transaction times range from 06:00 to 23:00 hours.

The data set was prepared for analysis by ensuring consistent variable formats and combining the available records.

5. Data Cleaning and Preparation

##      date             datetime          cash_type             card          
##  Length:3636        Length:3636        Length:3636        Length:3636       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      money       coffee_name       
##  Min.   :18.12   Length:3636       
##  1st Qu.:27.92   Class :character  
##  Median :32.82   Mode  :character  
##  Mean   :31.75                     
##  3rd Qu.:35.76                     
##  Max.   :40.00
##      date             datetime          cash_type             money     
##  Length:262         Length:262         Length:262         Min.   :15.0  
##  Class :character   Class :character   Class :character   1st Qu.:25.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :27.0  
##                                                           Mean   :26.3  
##                                                           3rd Qu.:29.0  
##                                                           Max.   :33.0  
##  coffee_name       
##  Length:262        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
## [1] "date"        "datetime"    "cash_type"   "card"        "money"      
## [6] "coffee_name"
## [1] "date"        "datetime"    "cash_type"   "money"       "coffee_name"
##          date                datetime cash_type                card money
## 1  2024-03-01 2024-03-01 10:15:50.520      card ANON-0000-0000-0001  38.7
## 2  2024-03-01 2024-03-01 12:19:22.539      card ANON-0000-0000-0002  38.7
## 3  2024-03-01 2024-03-01 12:20:18.089      card ANON-0000-0000-0002  38.7
## 4  2024-03-01 2024-03-01 13:46:33.006      card ANON-0000-0000-0003  28.9
## 5  2024-03-01 2024-03-01 13:48:14.626      card ANON-0000-0000-0004  38.7
## 6  2024-03-01 2024-03-01 15:39:47.726      card ANON-0000-0000-0005  33.8
## 7  2024-03-01 2024-03-01 16:19:02.756      card ANON-0000-0000-0006  38.7
## 8  2024-03-01 2024-03-01 18:39:03.580      card ANON-0000-0000-0007  33.8
## 9  2024-03-01 2024-03-01 19:22:01.762      card ANON-0000-0000-0008  38.7
## 10 2024-03-01 2024-03-01 19:23:15.887      card ANON-0000-0000-0008  33.8
##            coffee_name
## 1                Latte
## 2        Hot Chocolate
## 3        Hot Chocolate
## 4            Americano
## 5                Latte
## 6  Americano with Milk
## 7        Hot Chocolate
## 8  Americano with Milk
## 9                Cocoa
## 10 Americano with Milk
##          date            datetime cash_type money               coffee_name
## 1  2025-02-08 2025-02-08 14:26:04      cash    15                       Tea
## 2  2025-02-08 2025-02-08 14:28:26      cash    15                       Tea
## 3  2025-02-08 2025-02-08 14:33:04      card    20                  Espresso
## 4  2025-02-08 2025-02-08 15:51:04      card    30     Chocolate with coffee
## 5  2025-02-08 2025-02-08 16:35:01      cash    27       Chocolate with milk
## 6  2025-02-08 2025-02-08 18:55:30      card    33                  Espresso
## 7  2025-02-08 2025-02-08 18:59:03      cash    29 Coffee with Irish whiskey
## 8  2025-02-08 2025-02-08 19:00:28      cash    27   Irish whiskey with milk
## 9  2025-02-09 2025-02-09 08:45:45      card    28      Double Irish whiskey
## 10 2025-02-09 2025-02-09 10:41:36      cash    30     Chocolate with coffee
##            date                datetime cash_type                card money
## 3627 2025-03-22 2025-03-22 15:57:58.183      card ANON-0000-0000-1302 21.06
## 3628 2025-03-22 2025-03-22 16:26:59.766      card ANON-0000-0000-1274 35.76
## 3629 2025-03-22 2025-03-22 17:53:35.942      card ANON-0000-0000-1274 30.86
## 3630 2025-03-22 2025-03-22 19:16:52.727      card ANON-0000-0000-1314 30.86
## 3631 2025-03-23 2025-03-23 10:07:11.278      card ANON-0000-0000-1196 35.76
## 3632 2025-03-23 2025-03-23 10:34:54.894      card ANON-0000-0000-1158 35.76
## 3633 2025-03-23 2025-03-23 14:43:37.362      card ANON-0000-0000-1315 35.76
## 3634 2025-03-23 2025-03-23 14:44:16.864      card ANON-0000-0000-1315 35.76
## 3635 2025-03-23 2025-03-23 15:47:28.723      card ANON-0000-0000-1316 25.96
## 3636 2025-03-23 2025-03-23 18:11:38.635      card ANON-0000-0000-1275 35.76
##              coffee_name
## 3627            Espresso
## 3628               Latte
## 3629 Americano with Milk
## 3630 Americano with Milk
## 3631               Latte
## 3632          Cappuccino
## 3633               Cocoa
## 3634               Cocoa
## 3635           Americano
## 3636               Latte
##           date            datetime cash_type money                coffee_name
## 253 2025-03-22 2025-03-22 22:21:21      card    25        Americano with milk
## 254 2025-03-22 2025-03-22 22:23:39      card    29  Coffee with Irish whiskey
## 255 2025-03-23 2025-03-23 10:16:44      cash    25              Irish whiskey
## 256 2025-03-23 2025-03-23 10:25:48      cash    25        Americano with milk
## 257 2025-03-23 2025-03-23 10:25:48      cash    25        Americano with milk
## 258 2025-03-23 2025-03-23 14:55:46      cash    30                 Cappuccino
## 259 2025-03-23 2025-03-23 15:15:36      card    25              Irish whiskey
## 260 2025-03-23 2025-03-23 17:59:25      card    28            Super chocolate
## 261 2025-03-23 2025-03-23 18:01:33      card    28 Vanilla with Irish whiskey
## 262 2025-03-23 2025-03-23 21:23:11      card    29  Coffee with Irish whiskey
## 'data.frame':    3636 obs. of  6 variables:
##  $ date       : chr  "2024-03-01" "2024-03-01" "2024-03-01" "2024-03-01" ...
##  $ datetime   : chr  "2024-03-01 10:15:50.520" "2024-03-01 12:19:22.539" "2024-03-01 12:20:18.089" "2024-03-01 13:46:33.006" ...
##  $ cash_type  : chr  "card" "card" "card" "card" ...
##  $ card       : chr  "ANON-0000-0000-0001" "ANON-0000-0000-0002" "ANON-0000-0000-0002" "ANON-0000-0000-0003" ...
##  $ money      : num  38.7 38.7 38.7 28.9 38.7 33.8 38.7 33.8 38.7 33.8 ...
##  $ coffee_name: chr  "Latte" "Hot Chocolate" "Hot Chocolate" "Americano" ...
## 'data.frame':    262 obs. of  5 variables:
##  $ date       : chr  "2025-02-08" "2025-02-08" "2025-02-08" "2025-02-08" ...
##  $ datetime   : chr  "2025-02-08 14:26:04" "2025-02-08 14:28:26" "2025-02-08 14:33:04" "2025-02-08 15:51:04" ...
##  $ cash_type  : chr  "cash" "cash" "card" "card" ...
##  $ money      : num  15 15 20 30 27 33 29 27 28 30 ...
##  $ coffee_name: chr  "Tea" "Tea" "Espresso" "Chocolate with coffee" ...
## [1] 0
coffee_sales<- rbind(coffee_sales_one,coffee_sales_two) 
nrow(coffee_sales)
## [1] 3898
##         date                datetime cash_type                card money
## 1 2024-03-01 2024-03-01 10:15:50.520      card ANON-0000-0000-0001  38.7
## 2 2024-03-01 2024-03-01 12:19:22.539      card ANON-0000-0000-0002  38.7
## 3 2024-03-01 2024-03-01 12:20:18.089      card ANON-0000-0000-0002  38.7
## 4 2024-03-01 2024-03-01 13:46:33.006      card ANON-0000-0000-0003  28.9
## 5 2024-03-01 2024-03-01 13:48:14.626      card ANON-0000-0000-0004  38.7
## 6 2024-03-01 2024-03-01 15:39:47.726      card ANON-0000-0000-0005  33.8
##           coffee_name
## 1               Latte
## 2       Hot Chocolate
## 3       Hot Chocolate
## 4           Americano
## 5               Latte
## 6 Americano with Milk
mean(coffee_sales$money, na.rm = TRUE)
## [1] 31.3806
##            date            datetime cash_type card money
## 3893 2025-03-23 2025-03-23 10:25:48      cash <NA>    25
## 3894 2025-03-23 2025-03-23 14:55:46      cash <NA>    30
## 3895 2025-03-23 2025-03-23 15:15:36      card <NA>    25
## 3896 2025-03-23 2025-03-23 17:59:25      card <NA>    28
## 3897 2025-03-23 2025-03-23 18:01:33      card <NA>    28
## 3898 2025-03-23 2025-03-23 21:23:11      card <NA>    29
##                     coffee_name
## 3893        Americano with milk
## 3894                 Cappuccino
## 3895              Irish whiskey
## 3896            Super chocolate
## 3897 Vanilla with Irish whiskey
## 3898  Coffee with Irish whiskey
##      date             datetime          cash_type             card          
##  Length:3898        Length:3898        Length:3898        Length:3898       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      money       coffee_name       
##  Min.   :15.00   Length:3898       
##  1st Qu.:27.92   Class :character  
##  Median :32.82   Mode  :character  
##  Mean   :31.38                     
##  3rd Qu.:35.76                     
##  Max.   :40.00
## [1] Friday Friday Friday Friday Friday Friday
## Levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 'data.frame':    3898 obs. of  10 variables:
##  $ date       : chr  "2024-03-01" "2024-03-01" "2024-03-01" "2024-03-01" ...
##  $ datetime   : POSIXct, format: "2024-03-01 10:15:50" "2024-03-01 12:19:22" ...
##  $ cash_type  : chr  "card" "card" "card" "card" ...
##  $ card       : chr  "ANON-0000-0000-0001" "ANON-0000-0000-0002" "ANON-0000-0000-0002" "ANON-0000-0000-0003" ...
##  $ money      : num  38.7 38.7 38.7 28.9 38.7 33.8 38.7 33.8 38.7 33.8 ...
##  $ coffee_name: chr  "Latte" "Hot Chocolate" "Hot Chocolate" "Americano" ...
##  $ hour       : num  10 12 12 13 13 15 16 18 19 19 ...
##  $ day        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ month      : POSIXct, format: "2024-03-01" "2024-03-01" ...
##  $ weekday    : Factor w/ 7 levels "Monday","Tuesday",..: 5 5 5 5 5 5 5 5 5 5 ...
##        date    datetime   cash_type        card       money coffee_name 
##           0           0           0         262           0           0 
##        hour         day       month     weekday 
##           0           0           0           0
## [1] 1 1 1 1 1 1
##  POSIXct[1:3898], format: "2024-03-01 10:15:50" "2024-03-01 12:19:22" "2024-03-01 12:20:18" ...
## [1] "2024-03-01 +03" "2024-03-01 +03" "2024-03-01 +03" "2024-03-01 +03"
## [5] "2024-03-01 +03" "2024-03-01 +03"
sum(coffee_sales$money, na.rm = TRUE)
## [1] 122321.6
sum(coffee_sales$money)
## [1] 122321.6
sum(monthly_data$revenue)
## [1] 122321.6
sum(weekly_sales$money)
## [1] 122321.6
sum(daily_sales$money)
## [1] 122321.6
## tibble [13 × 3] (S3: tbl_df/tbl/data.frame)
##  $ month       : POSIXct[1:13], format: "2024-03-01" "2024-04-01" ...
##  $ revenue     : num [1:13] 7050 6721 9063 7759 6916 ...
##  $ transactions: int [1:13] 206 196 267 227 237 272 344 426 259 259 ...
range(daily_sales$money)
## [1] 2412.90 5096.58

A total of 262 missing values were identified in the card column of coffee_sales_two. The main data set does not contain missing values, these missing values are not expected to significantly affect the analysis as the value is not used for subsequent computation. Additionally, product name was standardized by converting all entries to lowercase in other to fix redundancy.

6. Descriptive Statistics

6.1 Payment method (card and cash)

ggplot(coffee_sales, aes(x=cash_type)) + 
  geom_bar(fill= "lightblue")+theme_classic()+ labs(title = "Payment Method Distribution of Transactions", 
                                   x="Payment Method", y= "Number of Transactions")

Interpretation:

Card payments account for the majority of transactions, while cash payments represent a relatively small proportion. This suggests that customers predominantly prefer electronic payment methods, which may reflect convenience and modern payment trends.

6.2 Product type

ggplot(coffee_sales, aes(x=coffee_name))+ geom_bar(fill= "#2C7FB8") + theme_classic(base_size=12) + coord_flip() +theme(axis.text.x = element_text(size=10))

Interpritation:

The distribution of coffee products shows that a small number of items account for a large share of total sales. This indicates that customer demand is concentrated on a few popular products, while the remaining items exhibit relatively lower sales frequency.

6.3 Distribution of Transaction by Revenue

ggplot(coffee_sales, aes(x = money)) +
  geom_histogram(bins = 20, fill = "purple") +
  theme_minimal() +
  labs(
    title = "Distribution of Transaction Revenue",
    x = "Revenue Range",
    y = "Number of Transactions"
  )

Interpretation:

The distribution of transaction revenue indicates that most purchases fall within a relatively narrow range, suggesting consistent pricing across products. The absence of extreme values implies that there are no significant outliers in transaction amounts.

6.4 Monthly Transaction Trend

ggplot(monthly_data, aes(x = month, y = transactions )) +
  geom_line(color = "#2C7FB8", linewidth = 1.2) +
  geom_point(color = "darkorange", size = 2) +
  scale_y_continuous(limits = c(100, NA))+
  theme_minimal(base_size = 12) +
  labs(title = "Monthly Transaction Trend",
    x = "Months of the Year",
    y = "Number of Transactions") + 
  theme(plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation:

The monthly transaction trend shows variation in the number of transactions across months. Some months exhibit higher transaction volumes, indicating periods of increased customer activity, while others show relatively lower activity levels.

7. Exploratory Data Analysis

7.1 Hourly Demand Analysis to determine the peak period

ggplot(hourly_demand, aes(x = hour, y = demand)) +
  geom_line(color = "#1F77B4", linewidth = 0.5) +
  geom_point(color= "#1F77B4", size = 1.7)+ 
  geom_text(aes(label = scales::comma(demand)),
  vjust = -1.5, angle = 0, color = "#6A5ACD",size = 2)+
  theme_classic()+ labs(title = "Hourly Customer Demand",
                        x= "Hour of the Day", y="Number of Orders") 

Interpretation:

Customer demand increases during the morning hours and reaches a peak at approximately 10:00. followed by gradual decline with minor fluctuations.

7.2 Product Demand Analysis

ggplot(hourly_product_demand, aes(x = hour, y = product_demand)) +
  geom_line(color= "#1F77B4") + facet_wrap(~ coffee_name)+ 
  theme_classic()+ labs(title = "Hourly Product Demand",                    
                        x= "Hour of the Day", y="Number of Orders") 

Interpretation:

A small number of products (Latte, Americano, Cappuccino, cortado, cocoa, hot chocolate) dominate demand, with Americano with milk recording the highest sales, while most other products display moderate to low or irregular sales patterns.

7.3 Payment Method preferences

ggplot(coffee_sales, aes(x = cash_type, fill = cash_type)) +
  geom_bar() +
  stat_count(aes(label = percent(..count../sum(..count..))),
             geom = "text",
             vjust = -0.2) +
  theme_classic() +
  labs(title = "Customers Payment Preferences",
       x = "Payment Options",
       y = "Sum of Payments Made")

Interpretation:

Card payments account for the majority of transactions, while cash payments represent a relatively small proportion.

7.4 Daily variation of Coffee Sale Revenue

ggplot(daily_sales, aes(x=day, y= money))+
  geom_col(fill= "steelblue")+ theme_classic() + 
  labs(title= "Sales Distribution by the Day of the Month", 
       x= "Days of the Month", y="Total Revenue")

Interpretation:

Daily revenue fluctuates between approximately 2412.90 and 5096.58. Stronger performance is observed in early and mid-month periods, with a decline toward end of the month.

7.5 Weekly variation of Coffee Sale Revenue

ggplot(weekly_sales, aes(x=weekday, y= money))+ 
  geom_col (fill = "royalblue" )+ theme_classic() + 
  labs(title= "Weekly Sales Distribution by the Day of the Week", 
       x= "Days of the Week", y="Weekly Revenue Generated")

Interpretation:

Sales are highest at the beginning of the week, particularly Mondays and Tuesdays, decline midweek particularly on Wednesdays and Thursdays with an increase on Fridays, before dropping during weekends.

7.6.1 Monthly variation of Coffee Sale Revenue

ggplot(monthly_data, aes(x = month, y = revenue)) +
  geom_col(fill = "#2C7FB8") +
  theme_minimal() +
  labs(
    title = "Revenue Trend by Month",
    x = "Month of the year",
    y = "Total Revenue"
  )

Interpretation:

The monthly revenue trend shows fluctuations in total revenue across the observed period. February records noticeably higher revenue, while March and October also show relatively high values, indicating periods of increased customer spending and stronger business performance. In contrast, the remaining months generally exhibit lower and more fluctuating revenue, suggesting an uneven distribution of sales over time.

7.6.2 correlation Between Sales Revenue and Purchase

ggplot(monthly_data, aes(x = transactions, y = revenue)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  theme_minimal() +
  labs(
    title = "Correlation Between Monthly Transactions and Revenue",
    x = "Number of Purchase",
    y = "Total Revenue Generated"
  )

Interpretation:

The scatter plot shows a very strong positive linear relationship between transactions and revenue, indicating that revenue is largely driven by transaction volume, with observations closely clustered around the regression line. Only minor deviations are observed, indicating slight variations in spending per transaction.

8. Overall Finding

The analysis reveals the following key insights: Sales performance is stronger on weekdays compared to weekends. Peak demand occurs at exact 10:00 in the morning, Card payments dominate customer transactions. A small subset of products account for the majority of demand, sales show predictable daily and weekly patterns.

9. Conclusion

The coffee shop exhibits clear and consistent sales patterns, with peak activity occurring in the morning hours and stronger performance during weekdays. These findings can support decisions related to staffing schedules, inventory management, and product strategy.

10. Appendix

R Code for Data Cleaning and Analysis

summary(coffee_sales_one) 
summary(coffee_sales_two) 
colnames(coffee_sales_one) 
colnames(coffee_sales_two)
head(coffee_sales_one, 10) 
head(coffee_sales_two, 10) 
tail(coffee_sales_one, 10) 
tail(coffee_sales_two, 10) 
str(coffee_sales_one) 
str(coffee_sales_two) 

sum(sapply(coffee_sales_one, function(x) sum(is.na(x)))) 
sum(sapply(coffee_sales_two, function(x) sum(is.na(x)))) 

coffee_sales_two$card <- NA

coffee_sales <- rbind(coffee_sales_one, coffee_sales_two)

nrow(coffee_sales)
head(coffee_sales)
tail(coffee_sales)
summary(coffee_sales)

coffee_sales$datetime <- as.POSIXct(
  coffee_sales$datetime, 
  format = "%Y-%m-%d %H:%M:%OS"
)

coffee_sales$hour <- as.numeric(format(coffee_sales$datetime, "%H"))

coffee_sales$day <- as.numeric(format(coffee_sales$datetime, "%d"))

coffee_sales$month <- floor_date(coffee_sales$datetime, "month")

coffee_sales$weekday <- factor(
  format(coffee_sales$datetime, "%A"),
  levels = c("Monday","Tuesday","Wednesday",
             "Thursday","Friday","Saturday","Sunday")
)

str(coffee_sales)
colSums(is.na(coffee_sales))
head(coffee_sales$weekday)
head(coffee_sales$day)
head(coffee_sales$month)

coffee_sales$coffee_name <- trimws(tolower(coffee_sales$coffee_name))

hourly_demand <- coffee_sales %>%
  count(hour, name = "demand")

hourly_product_demand <- coffee_sales %>%
  count(hour, coffee_name, name = "product_demand")

daily_sales <- coffee_sales %>%
  group_by(day) %>%
  summarise(money = sum(money, na.rm = TRUE)) %>%
  arrange(day)

weekly_sales <- coffee_sales %>%
  group_by(weekday) %>%
  summarise(money = sum(money, na.rm = TRUE))

monthly_data <- coffee_sales %>%
  mutate(month = floor_date(datetime, "month")) %>%
  group_by(month) %>%
  summarise(
    revenue = sum(money, na.rm = TRUE),
    transactions = n()
  )

sum(coffee_sales$money, na.rm = TRUE)
sum(monthly_data$revenue)
sum(weekly_sales$money)
sum(daily_sales$money)

mean(coffee_sales$money, na.rm = TRUE)

range(daily_sales$money)

str(monthly_data)