# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(corrplot)
## corrplot 0.95 loaded
# Load the dataset
df <- read.csv("Data/sales_data.csv") %>% clean_names()
head(df, 5)
## date year month customer_age customer_gender country state
## 1 02/19/16 2016 February 29 F United States Washington
## 2 02/20/16 2016 February 29 F United States Washington
## 3 02/27/16 2016 February 29 F United States Washington
## 4 03-12-16 2016 March 29 F United States Washington
## 5 03-12-16 2016 March 29 F United States Washington
## product_category sub_category quantity unit_cost unit_price cost revenue
## 1 Accessories Tires and Tubes 1 80.00 109.00000 80 109
## 2 Clothing Gloves 2 24.50 28.50000 49 57
## 3 Accessories Tires and Tubes 3 3.67 5.00000 11 15
## 4 Accessories Tires and Tubes 2 87.50 116.50000 175 233
## 5 Accessories Tires and Tubes 3 35.00 41.66667 105 125
## column1
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
# Count missing values in each column
colSums(is.na(df))
## date year month customer_age
## 0 1 0 1
## customer_gender country state product_category
## 0 0 0 0
## sub_category quantity unit_cost unit_price
## 0 1 1 1
## cost revenue column1
## 1 0 32293
# Replace missing values with 0
df[is.na(df)] <- 0
# View all duplicated rows in the data frame
duplicates <- df[duplicated(df), ]
print(duplicates)
## [1] date year month customer_age
## [5] customer_gender country state product_category
## [9] sub_category quantity unit_cost unit_price
## [13] cost revenue column1
## <0 rows> (or 0-length row.names)
# Remove duplicates
df <- df %>% distinct()
# Feature Engineering
df <- df %>%
mutate(
profit_per_transaction = revenue - cost,
profit_margin = (profit_per_transaction / revenue) * 100,
age_group = cut(
customer_age,
breaks = c(-Inf, 17, 24, 34, 44, 54, 64, Inf),
labels = c("<18", "18-24", "25-34", "35-44", "45-54", "55-64", "65+")
)
)
# Step 3: EDA - Central Tendency
summary_stats <- df %>%
summarise(
cost_mean = mean(cost, na.rm = TRUE),
cost_median = median(cost, na.rm = TRUE),
cost_mode = as.numeric(names(sort(table(cost), decreasing = TRUE)[1])),
revenue_mean = mean(revenue, na.rm = TRUE),
revenue_median = median(revenue, na.rm = TRUE),
revenue_mode = as.numeric(names(sort(table(revenue), decreasing = TRUE)[1]))
)
print(summary_stats)
## cost_mean cost_median cost_mode revenue_mean revenue_median revenue_mode
## 1 575.988 261 540 640.8701 319 6
# Histogram of Customer Age
ggplot(df, aes(x = customer_age)) +
geom_histogram(bins = 30, fill = 'skyblue', color = 'black') +
labs(title = "Customer Age Distribution", x = "Age", y = "Frequency")

# Correlation Heatmap
numeric_df <- df %>% select(where(is.numeric)) %>% drop_na()
cor_matrix <- cor(numeric_df)
corrplot(cor_matrix, method = "color", tl.cex = 0.7)

# Univariate: Age Group
ggplot(df, aes(x = age_group)) +
geom_bar(fill = "purple") +
labs(title = "Distribution by Age Group")

# Univariate: Customer Gender
ggplot(df, aes(x = customer_gender)) +
geom_bar(fill = "orange") +
labs(title = "Gender Distribution")

# Bivariate: Revenue by Gender
ggplot(df, aes(x = customer_gender, y = revenue, fill = customer_gender)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Revenue by Gender")

# Multivariate: Revenue by Age Group & Gender
ggplot(df, aes(x = age_group, y = revenue, fill = customer_gender)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
labs(title = "Avg Revenue by Age Group and Gender")

# Convert date column
df$date <- mdy(df$date)
df$month_year <- format(df$date, "%Y-%m")
# Revenue over time
monthly_revenue <- df %>%
group_by(month_year) %>%
summarise(total_revenue = sum(revenue, na.rm = TRUE))
ggplot(monthly_revenue, aes(x = month_year, y = total_revenue)) +
geom_line(group = 1, color = "blue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Monthly Revenue Trend", x = "Month-Year", y = "Revenue")

# Revenue by Product Category
ggplot(df, aes(x = product_category, y = revenue, fill = product_category)) +
geom_bar(stat = "summary", fun = "sum") +
labs(title = "Revenue by Product Category")

# Revenue by Age Group
ggplot(df, aes(x = age_group, y = revenue, fill = age_group)) +
geom_bar(stat = "summary", fun = "sum") +
labs(title = "Revenue by Age Group")

# Revenue vs. Profit
ggplot(df, aes(x = revenue, y = profit_per_transaction)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Revenue vs. Profit")
## `geom_smooth()` using formula = 'y ~ x'

# Top 10 Countries by Revenue
top_countries <- df %>%
group_by(country) %>%
summarise(total_revenue = sum(revenue, na.rm = TRUE)) %>%
slice_max(order_by = total_revenue, n = 10)
ggplot(top_countries, aes(x = reorder(country, total_revenue), y = total_revenue)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Countries by Revenue")
