INTRODUCTION
This synthetic dataset was downloaded from kaggle and it simulates retail and online sales. It contains:
Multiple product lines(Running,Basketball,Lifestyle,Training, Soccer,etc).
Gender Specific Sales(Men, Women, Kids).
Sales from both retail stores and online channels.
AIM
To clean Nike sales messy dataset.
OBJECTIVES
Identify all data issues
Produce a very clean dataset ready for Exploratory Data Analysis
Step1: Read the CSV file
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
uncleaned_data<- read.csv("/Users/mac/Downloads/Nike_Sales_Uncleaned.csv")
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 13
## $ Order_ID <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "11", "M", "…
## $ Units_Sold <int> NA, 3, 4, NA, NA, 1, NA, 1, -1, 4, 1, -1, -1, NA, NA,…
## $ MRP <dbl> NA, 4957.93, NA, 9673.57, NA, 7363.96, 6819.78, NA, N…
## $ Discount_Applied <dbl> 0.47, NA, NA, NA, NA, NA, NA, 0.32, NA, NA, NA, NA, 0…
## $ Revenue <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Order_Date <chr> "3/9/2024", "7/9/2024", "", "4/10/2024", "9/12/2024",…
## $ Sales_Channel <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region <chr> "bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
## $ Profit <dbl> -770.45, -112.53, 3337.34, 3376.85, 187.89, 1415.98, …
Step2: Check for the sums of missing values in each column
colSums(is.na(uncleaned_data))
## Order_ID Gender_Category Product_Line Product_Name
## 0 0 0 0
## Size Units_Sold MRP Discount_Applied
## 0 1235 1254 1668
## Revenue Order_Date Sales_Channel Region
## 0 0 0 0
## Profit
## 0
Step4:Replace the blank cells with NA
uncleaned_data[uncleaned_data==""] <- NA
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 13
## $ Order_ID <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "11", "M", "…
## $ Units_Sold <int> NA, 3, 4, NA, NA, 1, NA, 1, -1, 4, 1, -1, -1, NA, NA,…
## $ MRP <dbl> NA, 4957.93, NA, 9673.57, NA, 7363.96, 6819.78, NA, N…
## $ Discount_Applied <dbl> 0.47, NA, NA, NA, NA, NA, NA, 0.32, NA, NA, NA, NA, 0…
## $ Revenue <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Order_Date <chr> "3/9/2024", "7/9/2024", NA, "4/10/2024", "9/12/2024",…
## $ Sales_Channel <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region <chr> "bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
## $ Profit <dbl> -770.45, -112.53, 3337.34, 3376.85, 187.89, 1415.98, …
Step5:Clean the size column
uncleaned_data$Size<- as.roman(uncleaned_data$Size)# to convert everything to roman numeral.
glimpse(uncleaned_data$Size)
## 'roman' int [1:2500] M M M L XL M M M XI M ...
uncleaned_data$Size[is.na(uncleaned_data$Size)] <- "M"
sum(is.na(uncleaned_data$Size))
## [1] 0
#we no longer have missing values in the column "Size"
Step6:Clean the Units_Sold column
library(dplyr)
max(uncleaned_data$Units_Sold,na.rm=TRUE)# to check the maximum value in the column
## [1] 4
min(uncleaned_data$Units_Sold,na.rm=TRUE)# to check the minimum value in the column
## [1] -1
#I checked for the range in order to normalize the column
uncleaned_data$Units_Sold[uncleaned_data$Units_Sold== "0"|uncleaned_data$Units_Sold=="-1"] <- 1# to replace negative values or 0 with 1
mean <- round(mean(uncleaned_data$Units_Sold, na.rm = TRUE),1)
mean
## [1] 2
#Replace the NA's with value
sum(is.na(uncleaned_data$Units_Sold))
## [1] 1235
uncleaned_data$Units_Sold[is.na(uncleaned_data$Units_Sold)] <- mean #replacing the NA's with mean
glimpse(uncleaned_data$Units_Sold)
## num [1:2500] 2 3 4 2 2 1 2 1 1 4 ...
#we no longer have missing values
Step7: Clean the MRP column
sum(is.na(uncleaned_data$MRP))
## [1] 1254
uncleaned_data$MRP[is.na(uncleaned_data$MRP)] <- mean(uncleaned_data$MRP,na.rm=TRUE)
uncleaned_data$MRP <- round(uncleaned_data$MRP,digits=0)
glimpse(uncleaned_data$MRP)
## num [1:2500] 6040 4958 6040 9674 6040 ...
Step8:Clean the discount_applied
uncleaned_data$Discount_Applied[is.na(uncleaned_data$Discount_Applied)] <- median(uncleaned_data$Discount_Applied,na.rm=TRUE)
#some discounts are over 100percent
uncleaned_data$Discount_Applied[which(uncleaned_data$Discount_Applied>="1.00")] <- 0.04
which(uncleaned_data$Discount_Applied>="1.0")
## integer(0)
glimpse(uncleaned_data$Discount_Applied)
## num [1:2500] 0.47 0.615 0.615 0.615 0.615 0.615 0.615 0.32 0.615 0.615 ...
#we no longer haVe discounts greater than 100%
Step 9:Clean and adjust the revenue column
uncleaned_data$Revenue <- (uncleaned_data$Units_Sold*uncleaned_data$MRP *(1-uncleaned_data$Discount_Applied)) # formula for calculating Revenue(Units_sold x MRP) x (1- Discount Applied)
head(uncleaned_data$Revenue)
## [1] 6402.40 5726.49 9301.60 7448.98 4650.80 2835.14
uncleaned_data$Revenue <- round(uncleaned_data$Revenue,digits=0)
glimpse(uncleaned_data$Revenue)
## num [1:2500] 6402 5726 9302 7449 4651 ...
Step 10: Clean and adjust the date format and column
library(lubridate)
uncleaned_data$Order_Date <- parse_date_time(uncleaned_data$Order_Date,
orders = c("ymd", "dmy", "Y-m-d", "B d Y"),
tz = "UTC")
## Warning: 740 failed to parse.
uncleaned_data$Order_Date[is.na(uncleaned_data$Order_Date)] <-as.Date("2024-10-11")
uncleaned_data$Order_Date <- as.Date(uncleaned_data$Order_Date, format="%Y-%m-%d")
glimpse(uncleaned_data$Order_Date)
## Date[1:2500], format: "2024-09-03" "2024-09-07" "2024-10-11" "2024-10-04" "2024-12-09" ...
Step 11:Clean the Region column
library(stringr)
uncleaned_data<-uncleaned_data %>%
mutate(
Region = str_trim(str_to_title(Region))
)
glimpse(uncleaned_data$Region)
## chr [1:2500] "Bengaluru" "Hyd" "Mumbai" "Pune" "Delhi" "Delhi" "Bangalore" ...
Clean the profit(the profits are unrealistic)
glimpse(uncleaned_data$Profit)
## num [1:2500] -770 -113 3337 3377 188 ...
From here we have negative values.
Profit is the difference between revenue and expenses.
To calculate profit, I need Cost per unit, although I was given units sold it is a key component for the selling side(revenue) but it does not provide information about what was paid for the product.
So I would drop the profit column
uncleaned_data <- uncleaned_data[,-13]
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 12
## Warning in .numeric2roman(x): NAs introduced by coercion
## $ Order_ID <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size <roman> M, M, M, L, XL, M, M, M, XI, M, L, L, NA, NA, XI, N…
## $ Units_Sold <dbl> 2, 3, 4, 2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 2, 2, 2, 1, 4,…
## $ MRP <dbl> 6040, 4958, 6040, 9674, 6040, 7364, 6820, 6040, 6040,…
## $ Discount_Applied <dbl> 0.470, 0.615, 0.615, 0.615, 0.615, 0.615, 0.615, 0.32…
## $ Revenue <dbl> 6402, 5726, 9302, 7449, 4651, 2835, 5251, 4107, 2325,…
## $ Order_Date <date> 2024-09-03, 2024-09-07, 2024-10-11, 2024-10-04, 2024…
## $ Sales_Channel <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region <chr> "Bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
colSums(is.na(uncleaned_data))
## Order_ID Gender_Category Product_Line Product_Name
## 0 0 0 0
## Size Units_Sold MRP Discount_Applied
## 0 0 0 0
## Revenue Order_Date Sales_Channel Region
## 0 0 0 0
Let’s save the cleaned dataset
write.csv(uncleaned_data,"Nike sales cleaned data.csv",row.names=FALSE)
## Warning in .numeric2roman(x): NAs introduced by coercion
CONCLUSION
I have been able to clean Nike Sales Messy dataset.