Lesson 8: Building and Documenting a Data Warehouse

📦 Introduction

This report documents the full ETL (Extract, Transform, Load) process for building a data warehouse for an online retail company. It includes schema creation, data cleaning, and loading the processed data into a SQLite database. Visualizations and summary statistics are also provided.

Step 1: Load Required Libraries

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(DBI)
library(RSQLite)
library(ggplot2)

Step 2: Extract Data from Excel

# Define file path
file_path <- "/Users/calebazonsi/Downloads/ANA 515 Week 8 Activity.xlsx"

# Read Excel sheets
sales <- read_excel(file_path, sheet = "sales")
customers <- read_excel(file_path, sheet = "customers")
products <- read_excel(file_path, sheet = "products")

Step 3: Clean and Transform Data

# Standardize column names
names(sales) <- tolower(names(sales))
names(customers) <- tolower(names(customers))
names(products) <- tolower(names(products))

# Remove duplicates
sales <- distinct(sales)
customers <- distinct(customers)
products <- distinct(products)

# Convert signup_date to Date format
customers$signup_date <- as.Date(customers$signup_date)

# Create date_dim table from date_id
date_dim <- sales %>%
  select(date_id) %>%
  distinct() %>%
  arrange(date_id) %>%
  mutate(full_date = seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n()))

️ Step 4: Create and Load SQLite Tables

# Connect to SQLite
con <- dbConnect(SQLite(), dbname = "retail_dw.sqlite")

# Drop tables if re-running
dbExecute(con, "DROP TABLE IF EXISTS sales_fact")

## [1] 0

dbExecute(con, "DROP TABLE IF EXISTS customer_dim")

## [1] 0

dbExecute(con, "DROP TABLE IF EXISTS product_dim")

## [1] 0

dbExecute(con, "DROP TABLE IF EXISTS date_dim")

## [1] 0

# Create customer_dim
dbExecute(con, "
  CREATE TABLE customer_dim (
    customer_id INTEGER PRIMARY KEY,
    customer_name TEXT,
    location TEXT,
    signup_date DATE
  )")

## [1] 0

# Create product_dim
dbExecute(con, "
  CREATE TABLE product_dim (
    product_id INTEGER PRIMARY KEY,
    product_name TEXT,
    category TEXT,
    price REAL
  )")

## [1] 0

# Create date_dim
dbExecute(con, "
  CREATE TABLE date_dim (
    date_id INTEGER PRIMARY KEY,
    full_date DATE
  )")

## [1] 0

# Create sales_fact
dbExecute(con, "
  CREATE TABLE sales_fact (
    sale_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    product_id INTEGER,
    date_id INTEGER,
    quantity_sold INTEGER,
    total_price REAL,
    FOREIGN KEY (customer_id) REFERENCES customer_dim(customer_id),
    FOREIGN KEY (product_id) REFERENCES product_dim(product_id),
    FOREIGN KEY (date_id) REFERENCES date_dim(date_id)
  )")

## [1] 0

# Load data
dbWriteTable(con, "customer_dim", customers, append = TRUE)
dbWriteTable(con, "product_dim", products, append = TRUE)
dbWriteTable(con, "date_dim", date_dim, append = TRUE)
dbWriteTable(con, "sales_fact", sales, append = TRUE)

Step 5: Summary Statistics

summary(customers)

##   customer_id  customer_name        location          signup_date        
##  Min.   :101   Length:5           Length:5           Min.   :2019-08-21  
##  1st Qu.:102   Class :character   Class :character   1st Qu.:2020-05-10  
##  Median :103   Mode  :character   Mode  :character   Median :2020-07-30  
##  Mean   :103                                         Mean   :2020-10-03  
##  3rd Qu.:104                                         3rd Qu.:2021-01-05  
##  Max.   :105                                         Max.   :2022-03-15

summary(products)

##    product_id  product_name         category             price     
##  Min.   :201   Length:5           Length:5           Min.   :  25  
##  1st Qu.:202   Class :character   Class :character   1st Qu.:  50  
##  Median :203   Mode  :character   Mode  :character   Median :  80  
##  Mean   :203                                         Mean   : 271  
##  3rd Qu.:204                                         3rd Qu.: 200  
##  Max.   :205                                         Max.   :1000

summary(sales)

##     sale_id   customer_id    product_id     date_id    quantity_sold
##  Min.   :1   Min.   :101   Min.   :201   Min.   :301   Min.   :1.0  
##  1st Qu.:2   1st Qu.:102   1st Qu.:202   1st Qu.:302   1st Qu.:2.0  
##  Median :3   Median :103   Median :203   Median :303   Median :2.0  
##  Mean   :3   Mean   :103   Mean   :203   Mean   :303   Mean   :2.6  
##  3rd Qu.:4   3rd Qu.:104   3rd Qu.:204   3rd Qu.:304   3rd Qu.:3.0  
##  Max.   :5   Max.   :105   Max.   :205   Max.   :305   Max.   :5.0  
##   total_price   
##  Min.   : 25.5  
##  1st Qu.: 40.0  
##  Median : 50.0  
##  Mean   : 58.1  
##  3rd Qu.: 75.0  
##  Max.   :100.0

Step 6: Data Visualizations

Quantity Sold Distribution

ggplot(sales, aes(x = quantity_sold)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  labs(title = "Distribution of Quantity Sold", x = "Quantity Sold", y = "Frequency") +
  theme_minimal()

Total Price Distribution

ggplot(sales, aes(x = total_price)) +
  geom_histogram(binwidth = 10, fill = "darkgreen") +
  labs(title = "Distribution of Total Price", x = "Total Price", y = "Frequency") +
  theme_minimal()

Step 7: Disconnect from Database

dbDisconnect(con)

🧾 Conclusion

This R Markdown report demonstrates the end-to-end development of a data warehouse, from schema design and ETL processing to loading and visualizing cleaned retail data. By using R and SQLite, the process remains transparent and reproducible, making it ideal for documentation and auditing.