1. ETL Process
Step 1: Extract
I start by importing 3 sheet from an excel file containing customer, product and sales data.
setwd("C:/Users/ciane/OneDrive/Documents/MS_DataAnalytics_MCDaniel/ANA_515/Week_8/R")
sales_data <- read_excel("ANA_515_Week_5_Activity.xlsx", sheet = "sales")
product_data <- read_excel("ANA_515_Week_5_Activity.xlsx", sheet = "products")
customer_data <- read_excel("ANA_515_Week_5_Activity.xlsx", sheet = "customers")
Step 2: Transform
I performed basic data cleaning:
clean_names <- function(df) {
df <- df %>%
rename_all(~gsub(" ", "_", .)) %>% # Clean column names
distinct() %>% # Remove duplicates
na.omit() # Remove missing values
return(df)
}
sales_data <- clean_names(sales_data)
customer_data <- clean_names(customer_data)
product_data <- clean_names(product_data)
# Standardized date formats
sales_data <- sales_data %>%
mutate(sales_date = ymd(sales_date)) %>%
mutate(date_id = as.integer(factor(sales_date))) # assign date_id
Step 3: Load
I used an SQLite database to load the cleaned data.
# Connect to SQLite DB (can be any other DB with right driver)
con <- dbConnect(SQLite(), "retail_data_warehouse.db")
# convert date field into character before load in DB to prevent it to be stored in integer format
sales_data$sales_date <- as.character(sales_data$sales_date)
# Write data frames into SQL tables
dbWriteTable(con, "customer_dim", customer_data, overwrite = TRUE)
dbWriteTable(con, "product_dim", product_data, overwrite = TRUE)
dbWriteTable(con, "sales_fact", sales_data, overwrite = TRUE)
2. Create database Objects (fact and dimension tables)
I used SSMS to create customer, product, and date dimension tables and sales fact table
CREATE TABLE customer_dim_SQL (
customer_id INTEGER,
customer_name TEXT,
email VARCHAR(100),
location TEXT
);
CREATE TABLE product_dim_SQL (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
category VARCHAR(50),
price DECIMAL(10,2)
);
CREATE TABLE date_dim_SQL (
date_id INT PRIMARY KEY,
date DATE,
year INT,
month INT,
day INT
);
CREATE TABLE sales_fact_SQL (
sale_id INT PRIMARY KEY,
customer_id INT,
product_id INT,
date_id INT,
quantity_sold INT,
total_price DECIMAL(10,2),
FOREIGN KEY (customer_id) REFERENCES customer_dim(customer_id),
FOREIGN KEY (product_id) REFERENCES product_dim(product_id),
FOREIGN KEY (date_id) REFERENCES date_dim(date_id)
);
3. Insert Sample Data
After creating the tables, inserted values into these objects
INSERT INTO customer_dim_SQL VALUES
(1, 'Claire Gute', 'claire@gmail.com', 'New York'),
(2, 'Darrin Van Huff', 'huffD@gmail.com', 'Los Angeles'),
(3, 'Brosina Hoffman', 'hoffman@gmail.com', 'Chicago');
INSERT INTO product_dim_SQL VALUES
(1, 'Bush Somerset Collection Bookcase', 'Furniture', 25.99),
(2, 'Running Shoes', 'Apparel', 89.99),
(3, 'Xerox 1995', 'Office Supplies', 24.80);
INSERT INTO date_dim_SQL VALUES
(1, '2025-07-01', 2025, 7, 1),
(2, '2025-07-02', 2025, 7, 2),
(3, '2025-07-03', 2025, 7, 3);
INSERT INTO sales_fact_SQL VALUES
(1, 1, 1, 1, 2, 51.98),
(2, 2, 2, 2, 1, 89.99),
(3, 3, 3, 3, 5, 124.00);
4. Summary Statistics
Let us look at basic summaries of our data.
{r summary(sales_data)
5. Visualization
ggplot(sales_data, aes(x = quantity_sold)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
theme_minimal()
6. Conclusion
This report documented a simple ETL pipeline for loading customer order data from an excel file with 3 sheets into a SQL database. We performed basic cleaning, loaded data using dbWriteTable, and created summaries and visualizations.
{r dbDisconnect(con)