1 Data loading

library(readr)
data <- read_csv("C:/Users/Fakudze/Desktop/archive (8).zip")
## Rows: 10000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Transaction ID, Item, Quantity, Price Per Unit, Total Spent, Paymen...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(data)

2 Data examination

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
str(data)
## spc_tbl_ [10,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Transaction ID  : chr [1:10000] "TXN_1961373" "TXN_4977031" "TXN_4271903" "TXN_7034554" ...
##  $ Item            : chr [1:10000] "Coffee" "Cake" "Cookie" "Salad" ...
##  $ Quantity        : chr [1:10000] "2" "4" "4" "2" ...
##  $ Price Per Unit  : chr [1:10000] "2.0" "3.0" "1.0" "5.0" ...
##  $ Total Spent     : chr [1:10000] "4.0" "12.0" "ERROR" "10.0" ...
##  $ Payment Method  : chr [1:10000] "Credit Card" "Cash" "Credit Card" "UNKNOWN" ...
##  $ Location        : chr [1:10000] "Takeaway" "In-store" "In-store" "UNKNOWN" ...
##  $ Transaction Date: chr [1:10000] "2023-09-08" "2023-05-16" "2023-07-19" "2023-04-27" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Transaction ID` = col_character(),
##   ..   Item = col_character(),
##   ..   Quantity = col_character(),
##   ..   `Price Per Unit` = col_character(),
##   ..   `Total Spent` = col_character(),
##   ..   `Payment Method` = col_character(),
##   ..   Location = col_character(),
##   ..   `Transaction Date` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3 Data cleaning

3.1 Converting variables to proper types

library(dplyr)
data <- data %>% 
  rename(price_per_Unit = `Price Per Unit`,
         quantity = Quantity,
         item = Item,
         location = Location,
         transaction_id = `Transaction ID`,
         sales = `Total Spent`,
         payment_Method = `Payment Method`,
         date = `Transaction Date`) %>% 
  mutate(quantity = as.numeric(quantity),
         price_per_Unit = as.numeric(price_per_Unit),
         sales = as.numeric(sales),
         date = parse_date_time(date, orders = c("Ymd", "dmy", "mdy", "ymd")))
## Warning: There were 4 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `quantity = as.numeric(quantity)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

3.2 Extracting time components

data <- data %>% 
  mutate(year = year(date),
         month = month(date, label = T, abbr = F),
         month_number = month(date),
         day = day(date),
         weekday = wday(date, label = T, abbr = F),
         is.weekend = wday(date) %in% c(1, 7))

3.3 Checking For missing values in key variables

colSums(is.na(data))
## transaction_id           item       quantity price_per_Unit          sales 
##              0            333            479            533            502 
## payment_Method       location           date           year          month 
##           2579           3265            460            460            460 
##   month_number            day        weekday     is.weekend 
##            460            460            460              0

3.4 Cleaning missing values

data <- data %>% 
  filter(!is.na(date), !is.na(sales), sales > 0) %>% 
  mutate(payment_Method = replace_na(payment_Method, "Unknown"))

colSums(is.na(data))
## transaction_id           item       quantity price_per_Unit          sales 
##              0            304            434            487              0 
## payment_Method       location           date           year          month 
##              0           2959              0              0              0 
##   month_number            day        weekday     is.weekend 
##              0              0              0              0

3.5 Selecting clean data for analysis

clean_data <- data %>% 
  select(sales, date, year, month, month_number, day, weekday, is.weekend, payment_Method)

4 Data Analysis

4.1 Aggregating sales

sales_summary <- clean_data %>% 
  group_by(year, month, month_number, date, payment_Method, weekday) %>% 
  summarise(
    total_sales = sum(sales, na.rm = T),
    num_transactions = n(),
    avg_transaction = mean(sales, na.rm = T),
    .groups = "drop"
  )

4.2 Sales by year

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
sales_by_year <- sales_summary %>% 
  group_by(year, payment_Method) %>% 
  summarise(total_sales = sum(total_sales),
            num_transactions = sum(num_transactions), .groups = "drop")

ggplot(sales_by_year, aes(factor(year), y = total_sales, fill = payment_Method))+ 
  geom_col()+
  geom_text(aes(label = dollar(total_sales)),
            position = position_stack(vjust = 0.5), size = 3.5)+
  labs(title = "Total sales by year and payment method", y = "Total sale ($)", x = "Year")+
  scale_y_continuous(labels = dollar)+
  theme_minimal()

4.3 Sales by month

sales_by_month <- sales_summary %>% 
  group_by(month, month_number, payment_Method) %>% 
  summarise(total_sales = sum(total_sales), .groups = "drop") %>% 
  arrange(month_number)

ggplot(sales_by_month, aes(x = month, y = total_sales, fill = payment_Method)) +
  geom_col() +
  geom_text(
    aes(label = dollar(round(total_sales))),
        position = position_stack(vjust = 0.5), 
        size = 3.5
  ) +
  labs(
    title = "Total Sales by Month and payment method",
    y = "Total Sales ($)",
    x = "Month",
    fill = "payment method"
  ) +
  scale_y_continuous(labels = dollar) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  )

4.4 Sales by day of the week

library(dplyr)

sales_by_weekday <- sales_summary %>% 
  group_by(weekday, payment_Method) %>% 
  summarise(total_sales = sum(total_sales),
            num_transactions = sum(num_transactions), .groups = "drop")


ggplot(sales_by_weekday, aes(weekday, y = total_sales, fill = payment_Method))+
  geom_col()+
  geom_text(
    aes(label = dollar(round(total_sales))),
    position = position_stack(vjust = 0.5),
    size = 3.5,
    color = "white"
  )+
  scale_fill_brewer(palette = "Set2", name = "Payment Method"
  )+
  scale_y_continuous(labels = dollar, expand = expansion(mult = c(0, 0.1)))+
  labs(
    title = "Total sales by day of the week and payment method",
    subtitle = "Cafe sales data",
    x = "",
    y = "Total sale ($)"
  )+
  theme_minimal(base_size = 14)+
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    plot.title = element_text(face = "bold"),
    legend.position = "right"
)

4.5 Daily sales trend from January 2023 to January 2024

ggplot(sales_summary, aes(x = date, y = total_sales))+
  geom_line(color = "blue", alpha = 0.7)+
  geom_smooth(method = "loess", color = "red")+
  labs(title = "Daily sales trend", y = "Daily sales ($)", x = "Date")+
  scale_y_continuous(labels = dollar)+
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'