This report documents the full ETL (Extract, Transform, Load) process for building a data warehouse for an online retail company. It includes schema creation, data cleaning, and loading the processed data into a SQLite database. Visualizations and summary statistics are also provided.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DBI)
library(RSQLite)
library(ggplot2)
# Define file path
file_path <- "/Users/calebazonsi/Downloads/ANA 515 Week 8 Activity.xlsx"
# Read Excel sheets
sales <- read_excel(file_path, sheet = "sales")
customers <- read_excel(file_path, sheet = "customers")
products <- read_excel(file_path, sheet = "products")
# Standardize column names
names(sales) <- tolower(names(sales))
names(customers) <- tolower(names(customers))
names(products) <- tolower(names(products))
# Remove duplicates
sales <- distinct(sales)
customers <- distinct(customers)
products <- distinct(products)
# Convert signup_date to Date format
customers$signup_date <- as.Date(customers$signup_date)
# Create date_dim table from date_id
date_dim <- sales %>%
select(date_id) %>%
distinct() %>%
arrange(date_id) %>%
mutate(full_date = seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n()))
# Connect to SQLite
con <- dbConnect(SQLite(), dbname = "retail_dw.sqlite")
# Drop tables if re-running
dbExecute(con, "DROP TABLE IF EXISTS sales_fact")
## [1] 0
dbExecute(con, "DROP TABLE IF EXISTS customer_dim")
## [1] 0
dbExecute(con, "DROP TABLE IF EXISTS product_dim")
## [1] 0
dbExecute(con, "DROP TABLE IF EXISTS date_dim")
## [1] 0
# Create customer_dim
dbExecute(con, "
CREATE TABLE customer_dim (
customer_id INTEGER PRIMARY KEY,
customer_name TEXT,
location TEXT,
signup_date DATE
)")
## [1] 0
# Create product_dim
dbExecute(con, "
CREATE TABLE product_dim (
product_id INTEGER PRIMARY KEY,
product_name TEXT,
category TEXT,
price REAL
)")
## [1] 0
# Create date_dim
dbExecute(con, "
CREATE TABLE date_dim (
date_id INTEGER PRIMARY KEY,
full_date DATE
)")
## [1] 0
# Create sales_fact
dbExecute(con, "
CREATE TABLE sales_fact (
sale_id INTEGER PRIMARY KEY,
customer_id INTEGER,
product_id INTEGER,
date_id INTEGER,
quantity_sold INTEGER,
total_price REAL,
FOREIGN KEY (customer_id) REFERENCES customer_dim(customer_id),
FOREIGN KEY (product_id) REFERENCES product_dim(product_id),
FOREIGN KEY (date_id) REFERENCES date_dim(date_id)
)")
## [1] 0
# Load data
dbWriteTable(con, "customer_dim", customers, append = TRUE)
dbWriteTable(con, "product_dim", products, append = TRUE)
dbWriteTable(con, "date_dim", date_dim, append = TRUE)
dbWriteTable(con, "sales_fact", sales, append = TRUE)
summary(customers)
## customer_id customer_name location signup_date
## Min. :101 Length:5 Length:5 Min. :2019-08-21
## 1st Qu.:102 Class :character Class :character 1st Qu.:2020-05-10
## Median :103 Mode :character Mode :character Median :2020-07-30
## Mean :103 Mean :2020-10-03
## 3rd Qu.:104 3rd Qu.:2021-01-05
## Max. :105 Max. :2022-03-15
summary(products)
## product_id product_name category price
## Min. :201 Length:5 Length:5 Min. : 25
## 1st Qu.:202 Class :character Class :character 1st Qu.: 50
## Median :203 Mode :character Mode :character Median : 80
## Mean :203 Mean : 271
## 3rd Qu.:204 3rd Qu.: 200
## Max. :205 Max. :1000
summary(sales)
## sale_id customer_id product_id date_id quantity_sold
## Min. :1 Min. :101 Min. :201 Min. :301 Min. :1.0
## 1st Qu.:2 1st Qu.:102 1st Qu.:202 1st Qu.:302 1st Qu.:2.0
## Median :3 Median :103 Median :203 Median :303 Median :2.0
## Mean :3 Mean :103 Mean :203 Mean :303 Mean :2.6
## 3rd Qu.:4 3rd Qu.:104 3rd Qu.:204 3rd Qu.:304 3rd Qu.:3.0
## Max. :5 Max. :105 Max. :205 Max. :305 Max. :5.0
## total_price
## Min. : 25.5
## 1st Qu.: 40.0
## Median : 50.0
## Mean : 58.1
## 3rd Qu.: 75.0
## Max. :100.0
ggplot(sales, aes(x = quantity_sold)) +
geom_histogram(binwidth = 1, fill = "steelblue") +
labs(title = "Distribution of Quantity Sold", x = "Quantity Sold", y = "Frequency") +
theme_minimal()
ggplot(sales, aes(x = total_price)) +
geom_histogram(binwidth = 10, fill = "darkgreen") +
labs(title = "Distribution of Total Price", x = "Total Price", y = "Frequency") +
theme_minimal()
dbDisconnect(con)
This R Markdown report demonstrates the end-to-end development of a data warehouse, from schema design and ETL processing to loading and visualizing cleaned retail data. By using R and SQLite, the process remains transparent and reproducible, making it ideal for documentation and auditing.