INTRODUCTION

This synthetic dataset was downloaded from kaggle and it simulates retail and online sales. It contains:

AIM

To clean Nike sales messy dataset.

OBJECTIVES

Step1: Read the CSV file

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
uncleaned_data<- read.csv("/Users/mac/Downloads/Nike_Sales_Uncleaned.csv")
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 13
## $ Order_ID         <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category  <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line     <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name     <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size             <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "11", "M", "…
## $ Units_Sold       <int> NA, 3, 4, NA, NA, 1, NA, 1, -1, 4, 1, -1, -1, NA, NA,…
## $ MRP              <dbl> NA, 4957.93, NA, 9673.57, NA, 7363.96, 6819.78, NA, N…
## $ Discount_Applied <dbl> 0.47, NA, NA, NA, NA, NA, NA, 0.32, NA, NA, NA, NA, 0…
## $ Revenue          <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Order_Date       <chr> "3/9/2024", "7/9/2024", "", "4/10/2024", "9/12/2024",…
## $ Sales_Channel    <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region           <chr> "bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
## $ Profit           <dbl> -770.45, -112.53, 3337.34, 3376.85, 187.89, 1415.98, …

Step2: Check for the sums of missing values in each column

colSums(is.na(uncleaned_data))
##         Order_ID  Gender_Category     Product_Line     Product_Name 
##                0                0                0                0 
##             Size       Units_Sold              MRP Discount_Applied 
##                0             1235             1254             1668 
##          Revenue       Order_Date    Sales_Channel           Region 
##                0                0                0                0 
##           Profit 
##                0

Step4:Replace the blank cells with NA

uncleaned_data[uncleaned_data==""] <- NA
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 13
## $ Order_ID         <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category  <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line     <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name     <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size             <chr> "M", "M", "M", "L", "XL", "M", "M", "M", "11", "M", "…
## $ Units_Sold       <int> NA, 3, 4, NA, NA, 1, NA, 1, -1, 4, 1, -1, -1, NA, NA,…
## $ MRP              <dbl> NA, 4957.93, NA, 9673.57, NA, 7363.96, 6819.78, NA, N…
## $ Discount_Applied <dbl> 0.47, NA, NA, NA, NA, NA, NA, 0.32, NA, NA, NA, NA, 0…
## $ Revenue          <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Order_Date       <chr> "3/9/2024", "7/9/2024", NA, "4/10/2024", "9/12/2024",…
## $ Sales_Channel    <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region           <chr> "bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
## $ Profit           <dbl> -770.45, -112.53, 3337.34, 3376.85, 187.89, 1415.98, …

Step5:Clean the size column

uncleaned_data$Size<- as.roman(uncleaned_data$Size)# to convert everything to roman numeral. 
glimpse(uncleaned_data$Size)
##  'roman' int [1:2500] M  M  M  L  XL M  M  M  XI M  ...
uncleaned_data$Size[is.na(uncleaned_data$Size)] <- "M"
sum(is.na(uncleaned_data$Size))
## [1] 0
#we no longer have missing values in the column "Size"

Step6:Clean the Units_Sold column

library(dplyr)

max(uncleaned_data$Units_Sold,na.rm=TRUE)# to check the maximum value in the column
## [1] 4
min(uncleaned_data$Units_Sold,na.rm=TRUE)# to check the minimum value  in the column
## [1] -1
#I checked for the range in order to normalize the column
uncleaned_data$Units_Sold[uncleaned_data$Units_Sold== "0"|uncleaned_data$Units_Sold=="-1"] <- 1# to replace negative values or 0 with 1 

mean <- round(mean(uncleaned_data$Units_Sold, na.rm = TRUE),1)
mean
## [1] 2
#Replace the NA's with value
sum(is.na(uncleaned_data$Units_Sold))
## [1] 1235
uncleaned_data$Units_Sold[is.na(uncleaned_data$Units_Sold)] <- mean #replacing the NA's with mean
glimpse(uncleaned_data$Units_Sold)
##  num [1:2500] 2 3 4 2 2 1 2 1 1 4 ...
#we no longer have missing values 

Step7: Clean the MRP column

sum(is.na(uncleaned_data$MRP))
## [1] 1254
uncleaned_data$MRP[is.na(uncleaned_data$MRP)] <- mean(uncleaned_data$MRP,na.rm=TRUE)
uncleaned_data$MRP <- round(uncleaned_data$MRP,digits=0)
glimpse(uncleaned_data$MRP)
##  num [1:2500] 6040 4958 6040 9674 6040 ...

Step8:Clean the discount_applied

uncleaned_data$Discount_Applied[is.na(uncleaned_data$Discount_Applied)] <- median(uncleaned_data$Discount_Applied,na.rm=TRUE)
#some discounts are over 100percent 
uncleaned_data$Discount_Applied[which(uncleaned_data$Discount_Applied>="1.00")] <- 0.04
which(uncleaned_data$Discount_Applied>="1.0")
## integer(0)
glimpse(uncleaned_data$Discount_Applied)
##  num [1:2500] 0.47 0.615 0.615 0.615 0.615 0.615 0.615 0.32 0.615 0.615 ...
#we no longer haVe discounts greater than 100%

Step 9:Clean and adjust the revenue column

uncleaned_data$Revenue <- (uncleaned_data$Units_Sold*uncleaned_data$MRP *(1-uncleaned_data$Discount_Applied)) # formula for calculating Revenue(Units_sold x MRP) x (1- Discount Applied)
head(uncleaned_data$Revenue)
## [1] 6402.40 5726.49 9301.60 7448.98 4650.80 2835.14
uncleaned_data$Revenue <- round(uncleaned_data$Revenue,digits=0)
glimpse(uncleaned_data$Revenue)
##  num [1:2500] 6402 5726 9302 7449 4651 ...

Step 10: Clean and adjust the date format and column

library(lubridate)
uncleaned_data$Order_Date <- parse_date_time(uncleaned_data$Order_Date,
                                  orders = c("ymd", "dmy", "Y-m-d", "B d Y"),
                                  tz = "UTC")
## Warning: 740 failed to parse.
uncleaned_data$Order_Date[is.na(uncleaned_data$Order_Date)] <-as.Date("2024-10-11")
uncleaned_data$Order_Date <- as.Date(uncleaned_data$Order_Date, format="%Y-%m-%d")
glimpse(uncleaned_data$Order_Date)
##  Date[1:2500], format: "2024-09-03" "2024-09-07" "2024-10-11" "2024-10-04" "2024-12-09" ...

Step 11:Clean the Region column

library(stringr)
uncleaned_data<-uncleaned_data %>%
  mutate(
    Region = str_trim(str_to_title(Region))
  )
glimpse(uncleaned_data$Region)
##  chr [1:2500] "Bengaluru" "Hyd" "Mumbai" "Pune" "Delhi" "Delhi" "Bangalore" ...

Clean the profit(the profits are unrealistic)

glimpse(uncleaned_data$Profit)
##  num [1:2500] -770 -113 3337 3377 188 ...

From here we have negative values.

Profit is the difference between revenue and expenses.

To calculate profit, I need Cost per unit, although I was given units sold it is a key component for the selling side(revenue) but it does not provide information about what was paid for the product.

So I would drop the profit column

uncleaned_data <- uncleaned_data[,-13]
glimpse(uncleaned_data)
## Rows: 2,500
## Columns: 12
## Warning in .numeric2roman(x): NAs introduced by coercion
## $ Order_ID         <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
## $ Gender_Category  <chr> "Kids", "Women", "Women", "Kids", "Kids", "Women", "M…
## $ Product_Line     <chr> "Training", "Soccer", "Soccer", "Lifestyle", "Running…
## $ Product_Name     <chr> "SuperRep Go", "Tiempo Legend", "Premier III", "Blaze…
## $ Size             <roman> M, M, M, L, XL, M, M, M, XI, M, L, L, NA, NA, XI, N…
## $ Units_Sold       <dbl> 2, 3, 4, 2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 2, 2, 2, 1, 4,…
## $ MRP              <dbl> 6040, 4958, 6040, 9674, 6040, 7364, 6820, 6040, 6040,…
## $ Discount_Applied <dbl> 0.470, 0.615, 0.615, 0.615, 0.615, 0.615, 0.615, 0.32…
## $ Revenue          <dbl> 6402, 5726, 9302, 7449, 4651, 2835, 5251, 4107, 2325,…
## $ Order_Date       <date> 2024-09-03, 2024-09-07, 2024-10-11, 2024-10-04, 2024…
## $ Sales_Channel    <chr> "Online", "Retail", "Retail", "Online", "Retail", "Re…
## $ Region           <chr> "Bengaluru", "Hyd", "Mumbai", "Pune", "Delhi", "Delhi…
colSums(is.na(uncleaned_data))
##         Order_ID  Gender_Category     Product_Line     Product_Name 
##                0                0                0                0 
##             Size       Units_Sold              MRP Discount_Applied 
##                0                0                0                0 
##          Revenue       Order_Date    Sales_Channel           Region 
##                0                0                0                0

Let’s save the cleaned dataset

write.csv(uncleaned_data,"Nike sales cleaned data.csv",row.names=FALSE)
## Warning in .numeric2roman(x): NAs introduced by coercion

CONCLUSION

I have been able to clean Nike Sales Messy dataset.