Data loading
library(readr)
data <- read_csv("C:/Users/Fakudze/Desktop/archive (8).zip")
## Rows: 10000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Transaction ID, Item, Quantity, Price Per Unit, Total Spent, Paymen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data examination
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.1.0
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## spc_tbl_ [10,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Transaction ID : chr [1:10000] "TXN_1961373" "TXN_4977031" "TXN_4271903" "TXN_7034554" ...
## $ Item : chr [1:10000] "Coffee" "Cake" "Cookie" "Salad" ...
## $ Quantity : chr [1:10000] "2" "4" "4" "2" ...
## $ Price Per Unit : chr [1:10000] "2.0" "3.0" "1.0" "5.0" ...
## $ Total Spent : chr [1:10000] "4.0" "12.0" "ERROR" "10.0" ...
## $ Payment Method : chr [1:10000] "Credit Card" "Cash" "Credit Card" "UNKNOWN" ...
## $ Location : chr [1:10000] "Takeaway" "In-store" "In-store" "UNKNOWN" ...
## $ Transaction Date: chr [1:10000] "2023-09-08" "2023-05-16" "2023-07-19" "2023-04-27" ...
## - attr(*, "spec")=
## .. cols(
## .. `Transaction ID` = col_character(),
## .. Item = col_character(),
## .. Quantity = col_character(),
## .. `Price Per Unit` = col_character(),
## .. `Total Spent` = col_character(),
## .. `Payment Method` = col_character(),
## .. Location = col_character(),
## .. `Transaction Date` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Data cleaning
Converting variables
to proper types
library(dplyr)
data <- data %>%
rename(price_per_Unit = `Price Per Unit`,
quantity = Quantity,
item = Item,
location = Location,
transaction_id = `Transaction ID`,
sales = `Total Spent`,
payment_Method = `Payment Method`,
date = `Transaction Date`) %>%
mutate(quantity = as.numeric(quantity),
price_per_Unit = as.numeric(price_per_Unit),
sales = as.numeric(sales),
date = parse_date_time(date, orders = c("Ymd", "dmy", "mdy", "ymd")))
## Warning: There were 4 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `quantity = as.numeric(quantity)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.
Checking For missing
values in key variables
## transaction_id item quantity price_per_Unit sales
## 0 333 479 533 502
## payment_Method location date year month
## 2579 3265 460 460 460
## month_number day weekday is.weekend
## 460 460 460 0
Cleaning missing
values
data <- data %>%
filter(!is.na(date), !is.na(sales), sales > 0) %>%
mutate(payment_Method = replace_na(payment_Method, "Unknown"))
colSums(is.na(data))
## transaction_id item quantity price_per_Unit sales
## 0 304 434 487 0
## payment_Method location date year month
## 0 2959 0 0 0
## month_number day weekday is.weekend
## 0 0 0 0
Selecting clean data
for analysis
clean_data <- data %>%
select(sales, date, year, month, month_number, day, weekday, is.weekend, payment_Method)
Data Analysis
Aggregating
sales
sales_summary <- clean_data %>%
group_by(year, month, month_number, date, payment_Method, weekday) %>%
summarise(
total_sales = sum(sales, na.rm = T),
num_transactions = n(),
avg_transaction = mean(sales, na.rm = T),
.groups = "drop"
)
Sales by year
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
sales_by_year <- sales_summary %>%
group_by(year, payment_Method) %>%
summarise(total_sales = sum(total_sales),
num_transactions = sum(num_transactions), .groups = "drop")
ggplot(sales_by_year, aes(factor(year), y = total_sales, fill = payment_Method))+
geom_col()+
geom_text(aes(label = dollar(total_sales)),
position = position_stack(vjust = 0.5), size = 3.5)+
labs(title = "Total sales by year and payment method", y = "Total sale ($)", x = "Year")+
scale_y_continuous(labels = dollar)+
theme_minimal()

Sales by month
sales_by_month <- sales_summary %>%
group_by(month, month_number, payment_Method) %>%
summarise(total_sales = sum(total_sales), .groups = "drop") %>%
arrange(month_number)
ggplot(sales_by_month, aes(x = month, y = total_sales, fill = payment_Method)) +
geom_col() +
geom_text(
aes(label = dollar(round(total_sales))),
position = position_stack(vjust = 0.5),
size = 3.5
) +
labs(
title = "Total Sales by Month and payment method",
y = "Total Sales ($)",
x = "Month",
fill = "payment method"
) +
scale_y_continuous(labels = dollar) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold")
)

Sales by day of the
week
library(dplyr)
sales_by_weekday <- sales_summary %>%
group_by(weekday, payment_Method) %>%
summarise(total_sales = sum(total_sales),
num_transactions = sum(num_transactions), .groups = "drop")
ggplot(sales_by_weekday, aes(weekday, y = total_sales, fill = payment_Method))+
geom_col()+
geom_text(
aes(label = dollar(round(total_sales))),
position = position_stack(vjust = 0.5),
size = 3.5,
color = "white"
)+
scale_fill_brewer(palette = "Set2", name = "Payment Method"
)+
scale_y_continuous(labels = dollar, expand = expansion(mult = c(0, 0.1)))+
labs(
title = "Total sales by day of the week and payment method",
subtitle = "Cafe sales data",
x = "",
y = "Total sale ($)"
)+
theme_minimal(base_size = 14)+
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
plot.title = element_text(face = "bold"),
legend.position = "right"
)

Daily sales trend
from January 2023 to January 2024
ggplot(sales_summary, aes(x = date, y = total_sales))+
geom_line(color = "blue", alpha = 0.7)+
geom_smooth(method = "loess", color = "red")+
labs(title = "Daily sales trend", y = "Daily sales ($)", x = "Date")+
scale_y_continuous(labels = dollar)+
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
