STEP 3: WRANGLING DATA, DATA CLEANING AND COMBINING INTO A SINGLE
FILE
# Comparing file structure and column names of each files, to ensure they match perfectly
colnames(Q1)
## [1] "Customer_id" "Age" "Gender" "Revenue_Total"
## [5] "N_Purchases" "Purchase_DATE" "Purchase_VALUE" "Pay_Method"
## [9] "Time_Spent" "Browser" "Newsletter" "Voucher"
colnames(Q2)
## [1] "Customer.ID" "Age" "Gender"
## [4] "Item.Purchased" "Category" "Purchase.Amount..USD."
## [7] "Location" "Size" "Color"
## [10] "Season" "Review.Rating" "Subscription.Status"
## [13] "Shipping.Type" "Discount.Applied" "Promo.Code.Used"
## [16] "Previous.Purchases" "Payment.Method" "Frequency.of.Purchases"
colnames(Q9)
## [1] "Customer ID" "Age"
## [3] "Gender" "Item Purchased"
## [5] "Category" "Purchase Amount (USD)"
## [7] "Location" "Size"
## [9] "Color" "Season"
## [11] "Review Rating" "Subscription Status"
## [13] "Payment Method" "Shipping Type"
## [15] "Discount Applied" "Promo Code Used"
## [17] "Previous Purchases" "Preferred Payment Method"
## [19] "Frequency of Purchases"
str(Q1)
## 'data.frame': 65796 obs. of 12 variables:
## $ Customer_id : int 504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
## $ Age : int 53 18 52 29 21 55 17 30 51 63 ...
## $ Gender : int 0 1 1 0 1 0 1 1 0 1 ...
## $ Revenue_Total : num 45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
## $ N_Purchases : int 2 3 1 5 1 6 6 7 4 4 ...
## $ Purchase_DATE : chr "22.06.21" "10.12.21" "14.03.21" "25.10.21" ...
## $ Purchase_VALUE: num 24.9 2.9 10.6 43.3 56.9 ...
## $ Pay_Method : int 1 2 0 1 1 1 0 3 0 3 ...
## $ Time_Spent : int 885 656 761 906 605 364 654 1011 312 828 ...
## $ Browser : int 0 0 0 0 0 1 0 0 3 0 ...
## $ Newsletter : int 0 0 1 1 1 0 0 0 1 0 ...
## $ Voucher : int 0 1 0 0 0 0 0 0 0 0 ...
str(Q2)
## 'data.frame': 3900 obs. of 18 variables:
## $ Customer.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Item.Purchased : chr "Blouse" "Sweater" "Jeans" "Sandals" ...
## $ Category : chr "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Purchase.Amount..USD. : int 53 64 73 90 49 20 85 34 97 31 ...
## $ Location : chr "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
## $ Size : chr "L" "L" "S" "M" ...
## $ Color : chr "Gray" "Maroon" "Maroon" "Maroon" ...
## $ Season : chr "Winter" "Winter" "Spring" "Spring" ...
## $ Review.Rating : num 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
## $ Subscription.Status : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Shipping.Type : chr "Express" "Express" "Free Shipping" "Next Day Air" ...
## $ Discount.Applied : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Promo.Code.Used : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Previous.Purchases : int 14 2 23 49 31 14 49 19 8 4 ...
## $ Payment.Method : chr "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ Frequency.of.Purchases: chr "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
str(Q9)
## spc_tbl_ [3,900 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Customer ID : num [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr [1:3900] "Male" "Male" "Male" "Male" ...
## $ Item Purchased : chr [1:3900] "Blouse" "Sweater" "Jeans" "Sandals" ...
## $ Category : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Purchase Amount (USD) : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
## $ Location : chr [1:3900] "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
## $ Size : chr [1:3900] "L" "L" "S" "M" ...
## $ Color : chr [1:3900] "Gray" "Maroon" "Maroon" "Maroon" ...
## $ Season : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
## $ Review Rating : num [1:3900] 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
## $ Subscription Status : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Payment Method : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
## $ Shipping Type : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
## $ Discount Applied : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Promo Code Used : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Previous Purchases : num [1:3900] 14 2 23 49 31 14 49 19 8 4 ...
## $ Preferred Payment Method: chr [1:3900] "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ Frequency of Purchases : chr [1:3900] "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
## - attr(*, "spec")=
## .. cols(
## .. `Customer ID` = col_double(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. `Item Purchased` = col_character(),
## .. Category = col_character(),
## .. `Purchase Amount (USD)` = col_double(),
## .. Location = col_character(),
## .. Size = col_character(),
## .. Color = col_character(),
## .. Season = col_character(),
## .. `Review Rating` = col_double(),
## .. `Subscription Status` = col_character(),
## .. `Payment Method` = col_character(),
## .. `Shipping Type` = col_character(),
## .. `Discount Applied` = col_character(),
## .. `Promo Code Used` = col_character(),
## .. `Previous Purchases` = col_double(),
## .. `Preferred Payment Method` = col_character(),
## .. `Frequency of Purchases` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Reassigning 0 to "Not Subscribed" and 1 to "Subscribed" as initiated in the raw dataframe
Q1$Newsletter <- ifelse(Q1$Newsletter == 1, "Subscribed", "Not Subscribed")
# Reassigning "Not Subscribed" to "Normal" and "Subscribed" to "Member"
Q1$Subscription.status <- ifelse(Q1$Newsletter %in% c("Not Subscribed"), "Normal",
ifelse(Q1$Newsletter %in% c("Subscribed"), "Member",
NA))
table(Q1$Subscription.status) # Checking the tables for Subscription.Satus to ensure it is changed
##
## Member Normal
## 9937 55859
#Converting the Date from Chr to DATE format
Q1$Purchase_DATE <- dmy(Q1$Purchase_DATE)
Q1$date <- as.Date(Q1$Purchase_DATE) #The default format is yyyy-mm-dd
Q1$month <- format(as.Date(Q1$Purchase_DATE), "%m")
Q1$day <- format(as.Date(Q1$Purchase_DATE), "%d")
Q1$year <- format(as.Date(Q1$Purchase_DATE), "%Y")
Q1$day_of_week <- format(as.Date(Q1$Purchase_DATE), "%A")
Q1$month <- as.numeric(Q1$month) # Converting "month" to numeric
Q1$month <- month.name[Q1$month] #Converting the month(1,2,3...) to be month(January, Febuary, March...)
# Reassign "month" names as seasons
Q1$Season <- ifelse(Q1$month %in% c("December", "January", "February"), "Winter",
ifelse(Q1$month %in% c("March", "April", "May"), "Spring",
ifelse(Q1$month %in% c("June", "July", "August"), "Summer",
ifelse(Q1$month %in% c("September", "October", "November"), "Autumn", NA))))
Renaming Each file’s column to ensure it is consistent with each
other
## Renaming for Q1 Data
Q1 <- rename(Q1,
Customer.ID = Customer_id,
Age = Age,
Gender = Gender,
Total.Revenue = Revenue_Total,
Season = Season,
Sales.Channel = Time_Spent,
Subscription.Status = Subscription.status,
Pay.Method = Pay_Method,
Product.Category = NULL
)
# Renaming for Q2 Data
Q2 <- rename(Q2,
Customer.ID = Customer.ID,
Age = Age,
Gender = Gender,
Total.Revenue = Purchase.Amount..USD.,
Season = Season,
Sales.Channel = Shipping.Type,
Subscription.Status = Subscription.Status,
Pay.Method = Payment.Method,
Product.Category = Category
)
# Renaming for Q9 Data
Q9 <- rename(Q9,
Customer.ID = 'Customer ID',
Age = Age,
Gender = Gender,
Total.Revenue = 'Purchase Amount (USD)',
Season = Season,
Sales.Channel = 'Shipping Type',
Subscription.Status = 'Subscription Status',
Pay.Method = 'Payment Method',
Product.Category = Category
)
#checking the structure to confirm
str(Q1)
## 'data.frame': 65796 obs. of 19 variables:
## $ Customer.ID : int 504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
## $ Age : int 53 18 52 29 21 55 17 30 51 63 ...
## $ Gender : int 0 1 1 0 1 0 1 1 0 1 ...
## $ Total.Revenue : num 45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
## $ N_Purchases : int 2 3 1 5 1 6 6 7 4 4 ...
## $ Purchase_DATE : Date, format: "2021-06-22" "2021-12-10" ...
## $ Purchase_VALUE : num 24.9 2.9 10.6 43.3 56.9 ...
## $ Pay.Method : int 1 2 0 1 1 1 0 3 0 3 ...
## $ Sales.Channel : int 885 656 761 906 605 364 654 1011 312 828 ...
## $ Browser : int 0 0 0 0 0 1 0 0 3 0 ...
## $ Newsletter : chr "Not Subscribed" "Not Subscribed" "Subscribed" "Subscribed" ...
## $ Voucher : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Subscription.Status: chr "Normal" "Normal" "Member" "Member" ...
## $ date : Date, format: "2021-06-22" "2021-12-10" ...
## $ month : chr "June" "December" "March" "October" ...
## $ day : chr "22" "10" "14" "25" ...
## $ year : chr "2021" "2021" "2021" "2021" ...
## $ day_of_week : chr "Tuesday" "Friday" "Sunday" "Monday" ...
## $ Season : chr "Summer" "Winter" "Spring" "Autumn" ...
str(Q2)
## 'data.frame': 3900 obs. of 18 variables:
## $ Customer.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Item.Purchased : chr "Blouse" "Sweater" "Jeans" "Sandals" ...
## $ Product.Category : chr "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Total.Revenue : int 53 64 73 90 49 20 85 34 97 31 ...
## $ Location : chr "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
## $ Size : chr "L" "L" "S" "M" ...
## $ Color : chr "Gray" "Maroon" "Maroon" "Maroon" ...
## $ Season : chr "Winter" "Winter" "Spring" "Spring" ...
## $ Review.Rating : num 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
## $ Subscription.Status : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Sales.Channel : chr "Express" "Express" "Free Shipping" "Next Day Air" ...
## $ Discount.Applied : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Promo.Code.Used : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Previous.Purchases : int 14 2 23 49 31 14 49 19 8 4 ...
## $ Pay.Method : chr "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ Frequency.of.Purchases: chr "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
str(Q9)
## spc_tbl_ [3,900 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Customer.ID : num [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr [1:3900] "Male" "Male" "Male" "Male" ...
## $ Item Purchased : chr [1:3900] "Blouse" "Sweater" "Jeans" "Sandals" ...
## $ Product.Category : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Total.Revenue : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
## $ Location : chr [1:3900] "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
## $ Size : chr [1:3900] "L" "L" "S" "M" ...
## $ Color : chr [1:3900] "Gray" "Maroon" "Maroon" "Maroon" ...
## $ Season : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
## $ Review Rating : num [1:3900] 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
## $ Subscription.Status : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Pay.Method : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
## $ Sales.Channel : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
## $ Discount Applied : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Promo Code Used : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
## $ Previous Purchases : num [1:3900] 14 2 23 49 31 14 49 19 8 4 ...
## $ Preferred Payment Method: chr [1:3900] "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ Frequency of Purchases : chr [1:3900] "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
## - attr(*, "spec")=
## .. cols(
## .. `Customer ID` = col_double(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. `Item Purchased` = col_character(),
## .. Category = col_character(),
## .. `Purchase Amount (USD)` = col_double(),
## .. Location = col_character(),
## .. Size = col_character(),
## .. Color = col_character(),
## .. Season = col_character(),
## .. `Review Rating` = col_double(),
## .. `Subscription Status` = col_character(),
## .. `Payment Method` = col_character(),
## .. `Shipping Type` = col_character(),
## .. `Discount Applied` = col_character(),
## .. `Promo Code Used` = col_character(),
## .. `Previous Purchases` = col_double(),
## .. `Preferred Payment Method` = col_character(),
## .. `Frequency of Purchases` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
table(Q1$Gender)
##
## 0 1
## 21624 44172
table(Q2$Gender)
##
## Female Male
## 1248 2652
table(Q9$Gender)
##
## Female Male
## 1248 2652
# Reassigning the gender in Q1 data from "0", "1", "2" to become "Male", "female" and "Other
Q1 <- Q1 %>%
mutate(Gender = case_when(
Gender == 0 ~ "Male",
Gender == 1 ~ "Female",
Gender== 2 ~ "Other",
TRUE ~ as.character(Gender)
))
# Rename values in Payment_method to Correspond to Orignal Data Description
Q1 <- Q1 %>%
mutate(Pay.Method = case_when(
Pay.Method == 0 ~ "Digital wallets",
Pay.Method == 1 ~ "Card",
Pay.Method == 2 ~ "Paypal",
Pay.Method == 3 ~ "Other",
TRUE ~ as.character(Pay.Method)
))
table(Q2$Sales.Channel) # Checking how many observations that falls under Sales.Channel
##
## 2-Day Shipping Express Free Shipping Next Day Air Standard
## 627 646 675 648 654
## Store Pickup
## 650
# Rename values in Customer.Type in Q2
Q2 <- Q2 %>%
mutate(Subscription.Status = recode(Subscription.Status
,"Yes" = "Member"
,"No" = "Normal"))
# Assigning each Observation according to their Sales Channel needed for analysis
Q2 <- Q2 %>%
mutate(Sales.Channel = recode(Sales.Channel
,"Day Shipping" = "Online"
,"Express" = "Online"
,"Free Shipping" = "Online"
,"Next Day Air" = "Online"
, "Standard " = "Online"
, "Store Pickup" = "Offline"))
Q2$Total.Revenue <- as.numeric(Q2$Total.Revenue) # Converting Total revenue in Q2 data from Integer to numeric
#Rename values in for the Subscription Status, Converting Age from numeric to integer and converting Customer. ID from numeric to integer in Q9 data
Q9 <- Q9 %>%
mutate(Subscription.Status = recode(Subscription.Status
,"Yes" = "Member"
,"No" = "Normal"))
Q9$Age <- as.integer(Q9$Age)
Q9$Customer.ID <- as.integer(Q9$Customer.ID)
# Changing all values in Sales Channel to be Online since the inial values showed "Time Spent On Web"
Q1$Sales.Channel <- "Online"
## Removing Unwanted Columns in each Dataset that does not fit the Customer Sales analysis
Q1 <- Q1 %>%
select(-c(N_Purchases, Purchase_DATE, Purchase_VALUE, Browser, Newsletter, Voucher, date, month, day, year, day_of_week))
Q2<- Q2 %>%
select(-c(Item.Purchased , Location, Size, Color, Review.Rating, Discount.Applied, Promo.Code.Used, Previous.Purchases, Frequency.of.Purchases))
Q9<- Q9 %>%
select(-c('Item Purchased', Location, Size, Color, 'Review Rating', 'Discount Applied', 'Promo Code Used', 'Previous Purchases', 'Frequency of Purchases', 'Preferred Payment Method'))
# Confirming the Structure of each data to ensure complete wrangling
str(Q1)
## 'data.frame': 65796 obs. of 8 variables:
## $ Customer.ID : int 504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
## $ Age : int 53 18 52 29 21 55 17 30 51 63 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Total.Revenue : num 45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
## $ Pay.Method : chr "Card" "Paypal" "Digital wallets" "Card" ...
## $ Sales.Channel : chr "Online" "Online" "Online" "Online" ...
## $ Subscription.Status: chr "Normal" "Normal" "Member" "Member" ...
## $ Season : chr "Summer" "Winter" "Spring" "Autumn" ...
str(Q2)
## 'data.frame': 3900 obs. of 9 variables:
## $ Customer.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Product.Category : chr "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Total.Revenue : num 53 64 73 90 49 20 85 34 97 31 ...
## $ Season : chr "Winter" "Winter" "Spring" "Spring" ...
## $ Subscription.Status: chr "Member" "Member" "Member" "Member" ...
## $ Sales.Channel : chr "Online" "Online" "Online" "Online" ...
## $ Pay.Method : chr "Venmo" "Cash" "Credit Card" "PayPal" ...
str(Q9)
## tibble [3,900 × 9] (S3: tbl_df/tbl/data.frame)
## $ Customer.ID : int [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr [1:3900] "Male" "Male" "Male" "Male" ...
## $ Product.Category : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
## $ Total.Revenue : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
## $ Season : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
## $ Subscription.Status: chr [1:3900] "Member" "Member" "Member" "Member" ...
## $ Pay.Method : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
## $ Sales.Channel : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
# Stacking all data frames into One big data frame
Sales.Data <- bind_rows(Q1, Q2, Q9)
# count the number of NA value in each column
colSums(is.na(Sales.Data))
## Customer.ID Age Gender Total.Revenue
## 0 0 0 0
## Pay.Method Sales.Channel Subscription.Status Season
## 0 0 0 0
## Product.Category
## 65796
# remove NA
Sales.Data <-
Sales.Data[rowSums(is.na(Sales.Data)) !=
ncol(Sales.Data), ]
# Removing duplicate
Sales.Data <- unique(Sales.Data)
table(Sales.Data$Gender)
##
## Female Male
## 46608 26788
Sales.Data <- Sales.Data %>% filter(!is.na(Product.Category))
str(Sales.Data) #Confirm Data structure
## 'data.frame': 7600 obs. of 9 variables:
## $ Customer.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 55 19 50 21 45 46 63 27 26 57 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Total.Revenue : num 53 64 73 90 49 20 85 34 97 31 ...
## $ Pay.Method : chr "Venmo" "Cash" "Credit Card" "PayPal" ...
## $ Sales.Channel : chr "Online" "Online" "Online" "Online" ...
## $ Subscription.Status: chr "Member" "Member" "Member" "Member" ...
## $ Season : chr "Winter" "Winter" "Spring" "Spring" ...
## $ Product.Category : chr "Clothing" "Clothing" "Clothing" "Footwear" ...
STEP 4: CONDUCTING DESCRIPTIVE ANALYSIS (FINDING KEYMETRICS)
# Total number of customers
total_customers <- nrow(Sales.Data)
# Retained customers (those with a "Member" status)
retained_customers <- nrow(Sales.Data[Sales.Data$Subscription.Status == "Member", ])
# Calculate retention rate
retention_rate <- (retained_customers / total_customers) * 100
retention_rate
## [1] 26.98684
# Calculate percentages
Subscription.status_count <- as.data.frame(table(Sales.Data$Subscription.Status))
Subscription.status_count$percent <- round(Subscription.status_count$Freq / sum(Subscription.status_count$Freq) * 100, 1)
# Plot the pie chart
pie(Subscription.status_count$Freq, labels = paste(Subscription.status_count$Var1, Subscription.status_count$percent, "%"),
main = "Customer Retention (Member vs Normal)",
col = c("lightblue", "lightgreen"))

# Compare the Total Revenue by Customer Gender according to their Subscription Status
aggregate(Sales.Data$Total.Revenue ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = mean)
## Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1 Normal Female 60.25411
## 2 Member Male 59.53876
## 3 Normal Male 59.39640
aggregate(Sales.Data$Total.Revenue ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = median)
## Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1 Normal Female 60
## 2 Member Male 60
## 3 Normal Male 59
aggregate(Sales.Data$Total.Revenue ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = max)
## Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1 Normal Female 100
## 2 Member Male 100
## 3 Normal Male 100
aggregate(Sales.Data$Total.Revenue ~ Sales.Data$Subscription.Status+ Sales.Data$Gender, FUN = min)
## Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1 Normal Female 20
## 2 Member Male 20
## 3 Normal Male 20
# Group by Customer.Gender and Subscription.Status, then sum the total revenue
Total.Revenue <- Sales.Data %>%
group_by(Gender, Subscription.Status) %>%
summarise(Total.Revenue = sum(Total.Revenue, na.rm = TRUE), .groups = "drop")
# Create a bar plot of Total Revenue by Customer Gender and Subscription Status
Total.Revenue <- Sales.Data %>%
group_by(Gender, Subscription.Status) %>%
summarise(Total.Revenue = sum(Total.Revenue, na.rm = TRUE), .groups = "drop")
ggplot(Total.Revenue , aes(x = Gender, y = Total.Revenue, fill = Subscription.Status)) +
geom_bar(stat = "identity", position = "dodge") + # Use "dodge" to place bars side by side
labs(title = "Total Revenue by Customer Gender and Subscription Status",
x = "Customer Gender",
y = "Total Revenue") +
theme_minimal()

# Compare the Product category gotten by customers for each season
aggregate(Sales.Data$Season ~ Sales.Data$Product.Category + Sales.Data$Subscription.Status, FUN = max)
## Sales.Data$Product.Category Sales.Data$Subscription.Status Sales.Data$Season
## 1 Accessories Member Winter
## 2 Clothing Member Winter
## 3 Footwear Member Winter
## 4 Outerwear Member Winter
## 5 Accessories Normal Winter
## 6 Clothing Normal Winter
## 7 Footwear Normal Winter
## 8 Outerwear Normal Winter
aggregate(Sales.Data$Season ~ Sales.Data$Product.Category + Sales.Data$Subscription.Status, FUN = min)
## Sales.Data$Product.Category Sales.Data$Subscription.Status Sales.Data$Season
## 1 Accessories Member Fall
## 2 Clothing Member Fall
## 3 Footwear Member Fall
## 4 Outerwear Member Fall
## 5 Accessories Normal Fall
## 6 Clothing Normal Fall
## 7 Footwear Normal Fall
## 8 Outerwear Normal Fall
# Group by Season, Subscription.Status, and Product.Category, and Total purchases
Seasonal.Purchases <- Sales.Data %>%
group_by(Season, Subscription.Status, Product.Category) %>%
summarise(Total.Purchase = n()) %>% # n() counts the number of rows in each group
arrange(Season, Subscription.Status, desc(Total.Purchase))
## `summarise()` has grouped output by 'Season', 'Subscription.Status'. You can
## override using the `.groups` argument.
# Create a bar plot for seasonal purchases
ggplot(Seasonal.Purchases, aes(x = Season, y = Total.Purchase, fill = Product.Category)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ Subscription.Status) +
labs(title = "Product Purchases by Season and Subscription Status",
x = "Season",
y = "Number of Purchases")

# Save cleaned data frame as a CSV file
write.csv(Sales.Data, file = "Cleaned_Sales_Data.csv", row.names = FALSE)