End to End Data Analytics project

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(corrplot)

## corrplot 0.95 loaded

# Load the dataset
df <- read.csv("Data/sales_data.csv") %>% clean_names()
head(df, 5)

##       date year    month customer_age customer_gender       country      state
## 1 02/19/16 2016 February           29               F United States Washington
## 2 02/20/16 2016 February           29               F United States Washington
## 3 02/27/16 2016 February           29               F United States Washington
## 4 03-12-16 2016    March           29               F United States Washington
## 5 03-12-16 2016    March           29               F United States Washington
##   product_category    sub_category quantity unit_cost unit_price cost revenue
## 1      Accessories Tires and Tubes        1     80.00  109.00000   80     109
## 2         Clothing          Gloves        2     24.50   28.50000   49      57
## 3      Accessories Tires and Tubes        3      3.67    5.00000   11      15
## 4      Accessories Tires and Tubes        2     87.50  116.50000  175     233
## 5      Accessories Tires and Tubes        3     35.00   41.66667  105     125
##   column1
## 1      NA
## 2      NA
## 3      NA
## 4      NA
## 5      NA

# Count missing values in each column
colSums(is.na(df))

##             date             year            month     customer_age 
##                0                1                0                1 
##  customer_gender          country            state product_category 
##                0                0                0                0 
##     sub_category         quantity        unit_cost       unit_price 
##                0                1                1                1 
##             cost          revenue          column1 
##                1                0            32293

# Replace missing values with 0
df[is.na(df)] <- 0

# View all duplicated rows in the data frame
duplicates <- df[duplicated(df), ]
print(duplicates)

##  [1] date             year             month            customer_age    
##  [5] customer_gender  country          state            product_category
##  [9] sub_category     quantity         unit_cost        unit_price      
## [13] cost             revenue          column1         
## <0 rows> (or 0-length row.names)

# Remove duplicates
df <- df %>% distinct()

# Feature Engineering
df <- df %>%
  mutate(
    profit_per_transaction = revenue - cost,
    profit_margin = (profit_per_transaction / revenue) * 100,
    age_group = cut(
      customer_age,
      breaks = c(-Inf, 17, 24, 34, 44, 54, 64, Inf),
      labels = c("<18", "18-24", "25-34", "35-44", "45-54", "55-64", "65+")
    )
  )

# Step 3: EDA - Central Tendency
summary_stats <- df %>%
  summarise(
    cost_mean = mean(cost, na.rm = TRUE),
    cost_median = median(cost, na.rm = TRUE),
    cost_mode = as.numeric(names(sort(table(cost), decreasing = TRUE)[1])),
    revenue_mean = mean(revenue, na.rm = TRUE),
    revenue_median = median(revenue, na.rm = TRUE),
    revenue_mode = as.numeric(names(sort(table(revenue), decreasing = TRUE)[1]))
  )
print(summary_stats)

##   cost_mean cost_median cost_mode revenue_mean revenue_median revenue_mode
## 1   575.988         261       540     640.8701            319            6

# Histogram of Customer Age
ggplot(df, aes(x = customer_age)) +
  geom_histogram(bins = 30, fill = 'skyblue', color = 'black') +
  labs(title = "Customer Age Distribution", x = "Age", y = "Frequency")

# Correlation Heatmap
numeric_df <- df %>% select(where(is.numeric)) %>% drop_na()
cor_matrix <- cor(numeric_df)
corrplot(cor_matrix, method = "color", tl.cex = 0.7)

# Univariate: Age Group
ggplot(df, aes(x = age_group)) +
  geom_bar(fill = "purple") +
  labs(title = "Distribution by Age Group")

# Univariate: Customer Gender
ggplot(df, aes(x = customer_gender)) +
  geom_bar(fill = "orange") +
  labs(title = "Gender Distribution")

# Bivariate: Revenue by Gender
ggplot(df, aes(x = customer_gender, y = revenue, fill = customer_gender)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Revenue by Gender")

# Multivariate: Revenue by Age Group & Gender
ggplot(df, aes(x = age_group, y = revenue, fill = customer_gender)) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge") +
  labs(title = "Avg Revenue by Age Group and Gender")

# Convert date column
df$date <- mdy(df$date)
df$month_year <- format(df$date, "%Y-%m")

# Revenue over time
monthly_revenue <- df %>%
  group_by(month_year) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE))

ggplot(monthly_revenue, aes(x = month_year, y = total_revenue)) +
  geom_line(group = 1, color = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Monthly Revenue Trend", x = "Month-Year", y = "Revenue")

# Revenue by Product Category
ggplot(df, aes(x = product_category, y = revenue, fill = product_category)) +
  geom_bar(stat = "summary", fun = "sum") +
  labs(title = "Revenue by Product Category")

# Revenue by Age Group
ggplot(df, aes(x = age_group, y = revenue, fill = age_group)) +
  geom_bar(stat = "summary", fun = "sum") +
  labs(title = "Revenue by Age Group")

# Revenue vs. Profit
ggplot(df, aes(x = revenue, y = profit_per_transaction)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Revenue vs. Profit")

## `geom_smooth()` using formula = 'y ~ x'

# Top 10 Countries by Revenue
top_countries <- df %>%
  group_by(country) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE)) %>%
  slice_max(order_by = total_revenue, n = 10)

ggplot(top_countries, aes(x = reorder(country, total_revenue), y = total_revenue)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Countries by Revenue")

End to End Data Analytics project

Zephania Mwangi

2025-07-02