STEP 1: INSTALL PACKAGES

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
library(lubridate)
library(dplyr)

STEP 2:DATA COLLECTION

# # Upload Customer Sales Data (csv files) here
Q1 <- read.csv("C:/Users/USER-PC/Downloads/Online Shop Customer Sales Data (8).csv")
Q2 <- read.csv("C:/Users/USER-PC/Downloads/shopping_behavior_updated (1).csv")
Q9 <- read_csv("C:/Users/USER-PC/Downloads/shopping_trends.csv")
## Rows: 3900 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): Gender, Item Purchased, Category, Location, Size, Color, Season, S...
## dbl  (5): Customer ID, Age, Purchase Amount (USD), Review Rating, Previous P...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

STEP 3: WRANGLING DATA, DATA CLEANING AND COMBINING INTO A SINGLE FILE

# Comparing file structure and column names of each files, to ensure they match perfectly
colnames(Q1)
##  [1] "Customer_id"    "Age"            "Gender"         "Revenue_Total" 
##  [5] "N_Purchases"    "Purchase_DATE"  "Purchase_VALUE" "Pay_Method"    
##  [9] "Time_Spent"     "Browser"        "Newsletter"     "Voucher"
colnames(Q2)
##  [1] "Customer.ID"            "Age"                    "Gender"                
##  [4] "Item.Purchased"         "Category"               "Purchase.Amount..USD." 
##  [7] "Location"               "Size"                   "Color"                 
## [10] "Season"                 "Review.Rating"          "Subscription.Status"   
## [13] "Shipping.Type"          "Discount.Applied"       "Promo.Code.Used"       
## [16] "Previous.Purchases"     "Payment.Method"         "Frequency.of.Purchases"
colnames(Q9)
##  [1] "Customer ID"              "Age"                     
##  [3] "Gender"                   "Item Purchased"          
##  [5] "Category"                 "Purchase Amount (USD)"   
##  [7] "Location"                 "Size"                    
##  [9] "Color"                    "Season"                  
## [11] "Review Rating"            "Subscription Status"     
## [13] "Payment Method"           "Shipping Type"           
## [15] "Discount Applied"         "Promo Code Used"         
## [17] "Previous Purchases"       "Preferred Payment Method"
## [19] "Frequency of Purchases"
str(Q1)
## 'data.frame':    65796 obs. of  12 variables:
##  $ Customer_id   : int  504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
##  $ Age           : int  53 18 52 29 21 55 17 30 51 63 ...
##  $ Gender        : int  0 1 1 0 1 0 1 1 0 1 ...
##  $ Revenue_Total : num  45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
##  $ N_Purchases   : int  2 3 1 5 1 6 6 7 4 4 ...
##  $ Purchase_DATE : chr  "22.06.21" "10.12.21" "14.03.21" "25.10.21" ...
##  $ Purchase_VALUE: num  24.9 2.9 10.6 43.3 56.9 ...
##  $ Pay_Method    : int  1 2 0 1 1 1 0 3 0 3 ...
##  $ Time_Spent    : int  885 656 761 906 605 364 654 1011 312 828 ...
##  $ Browser       : int  0 0 0 0 0 1 0 0 3 0 ...
##  $ Newsletter    : int  0 0 1 1 1 0 0 0 1 0 ...
##  $ Voucher       : int  0 1 0 0 0 0 0 0 0 0 ...
str(Q2)
## 'data.frame':    3900 obs. of  18 variables:
##  $ Customer.ID           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                   : int  55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender                : chr  "Male" "Male" "Male" "Male" ...
##  $ Item.Purchased        : chr  "Blouse" "Sweater" "Jeans" "Sandals" ...
##  $ Category              : chr  "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Purchase.Amount..USD. : int  53 64 73 90 49 20 85 34 97 31 ...
##  $ Location              : chr  "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
##  $ Size                  : chr  "L" "L" "S" "M" ...
##  $ Color                 : chr  "Gray" "Maroon" "Maroon" "Maroon" ...
##  $ Season                : chr  "Winter" "Winter" "Spring" "Spring" ...
##  $ Review.Rating         : num  3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
##  $ Subscription.Status   : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Shipping.Type         : chr  "Express" "Express" "Free Shipping" "Next Day Air" ...
##  $ Discount.Applied      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Promo.Code.Used       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Previous.Purchases    : int  14 2 23 49 31 14 49 19 8 4 ...
##  $ Payment.Method        : chr  "Venmo" "Cash" "Credit Card" "PayPal" ...
##  $ Frequency.of.Purchases: chr  "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
str(Q9)
## spc_tbl_ [3,900 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Customer ID             : num [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : num [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender                  : chr [1:3900] "Male" "Male" "Male" "Male" ...
##  $ Item Purchased          : chr [1:3900] "Blouse" "Sweater" "Jeans" "Sandals" ...
##  $ Category                : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Purchase Amount (USD)   : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
##  $ Location                : chr [1:3900] "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
##  $ Size                    : chr [1:3900] "L" "L" "S" "M" ...
##  $ Color                   : chr [1:3900] "Gray" "Maroon" "Maroon" "Maroon" ...
##  $ Season                  : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
##  $ Review Rating           : num [1:3900] 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
##  $ Subscription Status     : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Payment Method          : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
##  $ Shipping Type           : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
##  $ Discount Applied        : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Promo Code Used         : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Previous Purchases      : num [1:3900] 14 2 23 49 31 14 49 19 8 4 ...
##  $ Preferred Payment Method: chr [1:3900] "Venmo" "Cash" "Credit Card" "PayPal" ...
##  $ Frequency of Purchases  : chr [1:3900] "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Customer ID` = col_double(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   `Item Purchased` = col_character(),
##   ..   Category = col_character(),
##   ..   `Purchase Amount (USD)` = col_double(),
##   ..   Location = col_character(),
##   ..   Size = col_character(),
##   ..   Color = col_character(),
##   ..   Season = col_character(),
##   ..   `Review Rating` = col_double(),
##   ..   `Subscription Status` = col_character(),
##   ..   `Payment Method` = col_character(),
##   ..   `Shipping Type` = col_character(),
##   ..   `Discount Applied` = col_character(),
##   ..   `Promo Code Used` = col_character(),
##   ..   `Previous Purchases` = col_double(),
##   ..   `Preferred Payment Method` = col_character(),
##   ..   `Frequency of Purchases` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Reassigning 0 to "Not Subscribed" and 1 to "Subscribed" as initiated in the raw dataframe
Q1$Newsletter <- ifelse(Q1$Newsletter == 1, "Subscribed", "Not Subscribed")
# Reassigning "Not Subscribed" to "Normal" and "Subscribed" to "Member"
  Q1$Subscription.status <- ifelse(Q1$Newsletter %in% c("Not Subscribed"), "Normal",
                     ifelse(Q1$Newsletter %in% c("Subscribed"), "Member",
                     NA))
table(Q1$Subscription.status) # Checking the tables for Subscription.Satus to ensure it is changed
## 
## Member Normal 
##   9937  55859
#Converting the Date from Chr to DATE format
Q1$Purchase_DATE <- dmy(Q1$Purchase_DATE)
Q1$date <- as.Date(Q1$Purchase_DATE) #The default format is yyyy-mm-dd
Q1$month <- format(as.Date(Q1$Purchase_DATE), "%m")
Q1$day <- format(as.Date(Q1$Purchase_DATE), "%d")
Q1$year <- format(as.Date(Q1$Purchase_DATE), "%Y")
Q1$day_of_week <- format(as.Date(Q1$Purchase_DATE), "%A")
Q1$month <- as.numeric(Q1$month)   # Converting "month" to numeric

Q1$month <- month.name[Q1$month]   #Converting the month(1,2,3...) to be month(January, Febuary,  March...)
# Reassign "month" names as seasons
Q1$Season <- ifelse(Q1$month %in% c("December", "January", "February"), "Winter",
                    ifelse(Q1$month %in% c("March", "April", "May"), "Spring",
                    ifelse(Q1$month %in% c("June", "July", "August"), "Summer",
                    ifelse(Q1$month %in% c("September", "October", "November"), "Autumn", NA))))

Renaming Each file’s column to ensure it is consistent with each other

 ## Renaming for Q1 Data
Q1 <- rename(Q1,
  Customer.ID = Customer_id,
  Age = Age,
  Gender = Gender,
  Total.Revenue = Revenue_Total,
  Season = Season,
  Sales.Channel = Time_Spent,
  Subscription.Status = Subscription.status,
  Pay.Method = Pay_Method,
  Product.Category = NULL
)

# Renaming for Q2 Data
Q2 <- rename(Q2,
  Customer.ID = Customer.ID,
  Age = Age,
  Gender = Gender,
  Total.Revenue = Purchase.Amount..USD.,
  Season = Season,
  Sales.Channel = Shipping.Type,
  Subscription.Status = Subscription.Status,
  Pay.Method = Payment.Method,
  Product.Category = Category
)

# Renaming for Q9 Data
Q9 <- rename(Q9,
  Customer.ID = 'Customer ID',
  Age = Age,
  Gender = Gender,
  Total.Revenue = 'Purchase Amount (USD)',
  Season = Season,
  Sales.Channel = 'Shipping Type',
  Subscription.Status = 'Subscription Status',
  Pay.Method = 'Payment Method',
  Product.Category = Category
)
#checking the structure to confirm
str(Q1)
## 'data.frame':    65796 obs. of  19 variables:
##  $ Customer.ID        : int  504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
##  $ Age                : int  53 18 52 29 21 55 17 30 51 63 ...
##  $ Gender             : int  0 1 1 0 1 0 1 1 0 1 ...
##  $ Total.Revenue      : num  45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
##  $ N_Purchases        : int  2 3 1 5 1 6 6 7 4 4 ...
##  $ Purchase_DATE      : Date, format: "2021-06-22" "2021-12-10" ...
##  $ Purchase_VALUE     : num  24.9 2.9 10.6 43.3 56.9 ...
##  $ Pay.Method         : int  1 2 0 1 1 1 0 3 0 3 ...
##  $ Sales.Channel      : int  885 656 761 906 605 364 654 1011 312 828 ...
##  $ Browser            : int  0 0 0 0 0 1 0 0 3 0 ...
##  $ Newsletter         : chr  "Not Subscribed" "Not Subscribed" "Subscribed" "Subscribed" ...
##  $ Voucher            : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Subscription.Status: chr  "Normal" "Normal" "Member" "Member" ...
##  $ date               : Date, format: "2021-06-22" "2021-12-10" ...
##  $ month              : chr  "June" "December" "March" "October" ...
##  $ day                : chr  "22" "10" "14" "25" ...
##  $ year               : chr  "2021" "2021" "2021" "2021" ...
##  $ day_of_week        : chr  "Tuesday" "Friday" "Sunday" "Monday" ...
##  $ Season             : chr  "Summer" "Winter" "Spring" "Autumn" ...
str(Q2)
## 'data.frame':    3900 obs. of  18 variables:
##  $ Customer.ID           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                   : int  55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender                : chr  "Male" "Male" "Male" "Male" ...
##  $ Item.Purchased        : chr  "Blouse" "Sweater" "Jeans" "Sandals" ...
##  $ Product.Category      : chr  "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Total.Revenue         : int  53 64 73 90 49 20 85 34 97 31 ...
##  $ Location              : chr  "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
##  $ Size                  : chr  "L" "L" "S" "M" ...
##  $ Color                 : chr  "Gray" "Maroon" "Maroon" "Maroon" ...
##  $ Season                : chr  "Winter" "Winter" "Spring" "Spring" ...
##  $ Review.Rating         : num  3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
##  $ Subscription.Status   : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Sales.Channel         : chr  "Express" "Express" "Free Shipping" "Next Day Air" ...
##  $ Discount.Applied      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Promo.Code.Used       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Previous.Purchases    : int  14 2 23 49 31 14 49 19 8 4 ...
##  $ Pay.Method            : chr  "Venmo" "Cash" "Credit Card" "PayPal" ...
##  $ Frequency.of.Purchases: chr  "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
str(Q9)
## spc_tbl_ [3,900 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Customer.ID             : num [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : num [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender                  : chr [1:3900] "Male" "Male" "Male" "Male" ...
##  $ Item Purchased          : chr [1:3900] "Blouse" "Sweater" "Jeans" "Sandals" ...
##  $ Product.Category        : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Total.Revenue           : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
##  $ Location                : chr [1:3900] "Kentucky" "Maine" "Massachusetts" "Rhode Island" ...
##  $ Size                    : chr [1:3900] "L" "L" "S" "M" ...
##  $ Color                   : chr [1:3900] "Gray" "Maroon" "Maroon" "Maroon" ...
##  $ Season                  : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
##  $ Review Rating           : num [1:3900] 3.1 3.1 3.1 3.5 2.7 2.9 3.2 3.2 2.6 4.8 ...
##  $ Subscription.Status     : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Pay.Method              : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
##  $ Sales.Channel           : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
##  $ Discount Applied        : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Promo Code Used         : chr [1:3900] "Yes" "Yes" "Yes" "Yes" ...
##  $ Previous Purchases      : num [1:3900] 14 2 23 49 31 14 49 19 8 4 ...
##  $ Preferred Payment Method: chr [1:3900] "Venmo" "Cash" "Credit Card" "PayPal" ...
##  $ Frequency of Purchases  : chr [1:3900] "Fortnightly" "Fortnightly" "Weekly" "Weekly" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Customer ID` = col_double(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   `Item Purchased` = col_character(),
##   ..   Category = col_character(),
##   ..   `Purchase Amount (USD)` = col_double(),
##   ..   Location = col_character(),
##   ..   Size = col_character(),
##   ..   Color = col_character(),
##   ..   Season = col_character(),
##   ..   `Review Rating` = col_double(),
##   ..   `Subscription Status` = col_character(),
##   ..   `Payment Method` = col_character(),
##   ..   `Shipping Type` = col_character(),
##   ..   `Discount Applied` = col_character(),
##   ..   `Promo Code Used` = col_character(),
##   ..   `Previous Purchases` = col_double(),
##   ..   `Preferred Payment Method` = col_character(),
##   ..   `Frequency of Purchases` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
table(Q1$Gender)
## 
##     0     1 
## 21624 44172
table(Q2$Gender)
## 
## Female   Male 
##   1248   2652
table(Q9$Gender)
## 
## Female   Male 
##   1248   2652
 # Reassigning the gender in Q1 data from "0", "1", "2" to become "Male", "female" and "Other
Q1 <- Q1 %>%
  mutate(Gender = case_when(
    Gender == 0 ~ "Male",
    Gender == 1 ~ "Female",
   Gender== 2 ~ "Other",
    TRUE ~ as.character(Gender)
  ))

# Rename values in Payment_method to Correspond to Orignal Data Description
Q1 <- Q1 %>%
  mutate(Pay.Method = case_when(
    Pay.Method == 0 ~ "Digital wallets",
    Pay.Method == 1 ~ "Card",
    Pay.Method == 2 ~ "Paypal",
    Pay.Method == 3 ~ "Other",
    TRUE ~ as.character(Pay.Method)
  ))
table(Q2$Sales.Channel)  # Checking how many observations that falls under Sales.Channel
## 
## 2-Day Shipping        Express  Free Shipping   Next Day Air       Standard 
##            627            646            675            648            654 
##   Store Pickup 
##            650
# Rename values in Customer.Type in Q2
  Q2 <- Q2 %>%
  mutate(Subscription.Status = recode(Subscription.Status
                                ,"Yes" = "Member"
                                ,"No" = "Normal"))
  
 
  
  
  # Assigning each Observation according to their Sales Channel needed for analysis
  Q2 <- Q2 %>%
  mutate(Sales.Channel = recode(Sales.Channel
                                ,"Day Shipping" = "Online"
                                ,"Express" = "Online"
                                ,"Free Shipping" = "Online"
                                ,"Next Day Air" = "Online"
                                , "Standard " = "Online"
                                , "Store Pickup" = "Offline"))
  


  
Q2$Total.Revenue  <- as.numeric(Q2$Total.Revenue) # Converting Total revenue in Q2 data from Integer to numeric
 #Rename values in for the Subscription Status, Converting Age from numeric to integer and converting Customer. ID from numeric to integer in Q9 data
  Q9 <- Q9 %>%
  mutate(Subscription.Status = recode(Subscription.Status
                                ,"Yes" = "Member"
                                ,"No" = "Normal"))


Q9$Age  <- as.integer(Q9$Age)
Q9$Customer.ID  <- as.integer(Q9$Customer.ID)
# Changing all values in Sales Channel to be Online since the inial values showed "Time Spent On Web"
Q1$Sales.Channel <- "Online"
## Removing Unwanted Columns in each Dataset that does not fit the Customer Sales analysis
Q1 <- Q1 %>%
select(-c(N_Purchases, Purchase_DATE, Purchase_VALUE, Browser, Newsletter,  Voucher, date, month, day, year, day_of_week))


Q2<- Q2 %>%
select(-c(Item.Purchased  , Location, Size, Color, Review.Rating, Discount.Applied, Promo.Code.Used, Previous.Purchases, Frequency.of.Purchases))


Q9<- Q9 %>%
select(-c('Item Purchased', Location, Size, Color, 'Review Rating', 'Discount Applied', 'Promo Code Used', 'Previous Purchases', 'Frequency of Purchases', 'Preferred Payment Method'))
# Confirming the Structure of each data to ensure complete wrangling
str(Q1)
## 'data.frame':    65796 obs. of  8 variables:
##  $ Customer.ID        : int  504308 504309 504310 504311 504312 504313 504314 504315 504316 504317 ...
##  $ Age                : int  53 18 52 29 21 55 17 30 51 63 ...
##  $ Gender             : chr  "Male" "Female" "Female" "Male" ...
##  $ Total.Revenue      : num  45.3 36.2 10.6 54.1 56.9 13.7 30.7 8.1 18 19.2 ...
##  $ Pay.Method         : chr  "Card" "Paypal" "Digital wallets" "Card" ...
##  $ Sales.Channel      : chr  "Online" "Online" "Online" "Online" ...
##  $ Subscription.Status: chr  "Normal" "Normal" "Member" "Member" ...
##  $ Season             : chr  "Summer" "Winter" "Spring" "Autumn" ...
str(Q2)
## 'data.frame':    3900 obs. of  9 variables:
##  $ Customer.ID        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                : int  55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender             : chr  "Male" "Male" "Male" "Male" ...
##  $ Product.Category   : chr  "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Total.Revenue      : num  53 64 73 90 49 20 85 34 97 31 ...
##  $ Season             : chr  "Winter" "Winter" "Spring" "Spring" ...
##  $ Subscription.Status: chr  "Member" "Member" "Member" "Member" ...
##  $ Sales.Channel      : chr  "Online" "Online" "Online" "Online" ...
##  $ Pay.Method         : chr  "Venmo" "Cash" "Credit Card" "PayPal" ...
str(Q9)
## tibble [3,900 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Customer.ID        : int [1:3900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                : int [1:3900] 55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender             : chr [1:3900] "Male" "Male" "Male" "Male" ...
##  $ Product.Category   : chr [1:3900] "Clothing" "Clothing" "Clothing" "Footwear" ...
##  $ Total.Revenue      : num [1:3900] 53 64 73 90 49 20 85 34 97 31 ...
##  $ Season             : chr [1:3900] "Winter" "Winter" "Spring" "Spring" ...
##  $ Subscription.Status: chr [1:3900] "Member" "Member" "Member" "Member" ...
##  $ Pay.Method         : chr [1:3900] "Credit Card" "Bank Transfer" "Cash" "PayPal" ...
##  $ Sales.Channel      : chr [1:3900] "Express" "Express" "Free Shipping" "Next Day Air" ...
# Stacking all data frames into One big data frame
Sales.Data <- bind_rows(Q1, Q2, Q9)
# count the number of NA value in each column
colSums(is.na(Sales.Data))
##         Customer.ID                 Age              Gender       Total.Revenue 
##                   0                   0                   0                   0 
##          Pay.Method       Sales.Channel Subscription.Status              Season 
##                   0                   0                   0                   0 
##    Product.Category 
##               65796
# remove NA
Sales.Data <-
  Sales.Data[rowSums(is.na(Sales.Data)) !=
             ncol(Sales.Data), ]
# Removing duplicate
Sales.Data <- unique(Sales.Data)
table(Sales.Data$Gender)
## 
## Female   Male 
##  46608  26788
Sales.Data <- Sales.Data %>% filter(!is.na(Product.Category))
str(Sales.Data)  #Confirm Data structure
## 'data.frame':    7600 obs. of  9 variables:
##  $ Customer.ID        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                : int  55 19 50 21 45 46 63 27 26 57 ...
##  $ Gender             : chr  "Male" "Male" "Male" "Male" ...
##  $ Total.Revenue      : num  53 64 73 90 49 20 85 34 97 31 ...
##  $ Pay.Method         : chr  "Venmo" "Cash" "Credit Card" "PayPal" ...
##  $ Sales.Channel      : chr  "Online" "Online" "Online" "Online" ...
##  $ Subscription.Status: chr  "Member" "Member" "Member" "Member" ...
##  $ Season             : chr  "Winter" "Winter" "Spring" "Spring" ...
##  $ Product.Category   : chr  "Clothing" "Clothing" "Clothing" "Footwear" ...

STEP 4: CONDUCTING DESCRIPTIVE ANALYSIS (FINDING KEYMETRICS)

# Total number of customers
total_customers <- nrow(Sales.Data)

# Retained customers (those with a "Member" status)
retained_customers <- nrow(Sales.Data[Sales.Data$Subscription.Status == "Member", ])

# Calculate retention rate
retention_rate <- (retained_customers / total_customers) * 100

retention_rate
## [1] 26.98684
# Calculate percentages
Subscription.status_count <- as.data.frame(table(Sales.Data$Subscription.Status))
Subscription.status_count$percent <- round(Subscription.status_count$Freq / sum(Subscription.status_count$Freq) * 100, 1)

# Plot the pie chart
pie(Subscription.status_count$Freq, labels = paste(Subscription.status_count$Var1, Subscription.status_count$percent, "%"), 
    main = "Customer Retention (Member vs Normal)",
    col = c("lightblue", "lightgreen"))

# Compare the Total Revenue by Customer Gender according to their Subscription Status
aggregate(Sales.Data$Total.Revenue  ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = mean)
##   Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1                         Normal            Female                 60.25411
## 2                         Member              Male                 59.53876
## 3                         Normal              Male                 59.39640
aggregate(Sales.Data$Total.Revenue  ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = median)
##   Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1                         Normal            Female                       60
## 2                         Member              Male                       60
## 3                         Normal              Male                       59
aggregate(Sales.Data$Total.Revenue  ~ Sales.Data$Subscription.Status + Sales.Data$Gender, FUN = max)
##   Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1                         Normal            Female                      100
## 2                         Member              Male                      100
## 3                         Normal              Male                      100
aggregate(Sales.Data$Total.Revenue  ~ Sales.Data$Subscription.Status+ Sales.Data$Gender, FUN = min)
##   Sales.Data$Subscription.Status Sales.Data$Gender Sales.Data$Total.Revenue
## 1                         Normal            Female                       20
## 2                         Member              Male                       20
## 3                         Normal              Male                       20
# Group by Customer.Gender and Subscription.Status, then sum the total revenue
Total.Revenue <- Sales.Data %>%
  group_by(Gender, Subscription.Status) %>%
  summarise(Total.Revenue = sum(Total.Revenue, na.rm = TRUE), .groups = "drop")
# Create a bar plot of Total Revenue by Customer Gender and Subscription Status
Total.Revenue <- Sales.Data %>%
  group_by(Gender, Subscription.Status) %>%
  summarise(Total.Revenue = sum(Total.Revenue, na.rm = TRUE), .groups = "drop")
ggplot(Total.Revenue , aes(x = Gender, y = Total.Revenue, fill = Subscription.Status)) +
  geom_bar(stat = "identity", position = "dodge") +  # Use "dodge" to place bars side by side
  labs(title = "Total Revenue by Customer Gender and Subscription Status",
       x = "Customer Gender",
       y = "Total Revenue") +
  theme_minimal()

# Compare the Product category gotten by customers for each season
aggregate(Sales.Data$Season  ~ Sales.Data$Product.Category + Sales.Data$Subscription.Status, FUN = max)
##   Sales.Data$Product.Category Sales.Data$Subscription.Status Sales.Data$Season
## 1                 Accessories                         Member            Winter
## 2                    Clothing                         Member            Winter
## 3                    Footwear                         Member            Winter
## 4                   Outerwear                         Member            Winter
## 5                 Accessories                         Normal            Winter
## 6                    Clothing                         Normal            Winter
## 7                    Footwear                         Normal            Winter
## 8                   Outerwear                         Normal            Winter
aggregate(Sales.Data$Season  ~ Sales.Data$Product.Category + Sales.Data$Subscription.Status, FUN = min)
##   Sales.Data$Product.Category Sales.Data$Subscription.Status Sales.Data$Season
## 1                 Accessories                         Member              Fall
## 2                    Clothing                         Member              Fall
## 3                    Footwear                         Member              Fall
## 4                   Outerwear                         Member              Fall
## 5                 Accessories                         Normal              Fall
## 6                    Clothing                         Normal              Fall
## 7                    Footwear                         Normal              Fall
## 8                   Outerwear                         Normal              Fall
# Group by Season, Subscription.Status, and Product.Category, and Total purchases
Seasonal.Purchases <- Sales.Data %>%
  group_by(Season, Subscription.Status, Product.Category) %>%
  summarise(Total.Purchase = n()) %>%  # n() counts the number of rows in each group
  arrange(Season, Subscription.Status, desc(Total.Purchase))
## `summarise()` has grouped output by 'Season', 'Subscription.Status'. You can
## override using the `.groups` argument.
# Create a bar plot for seasonal purchases
ggplot(Seasonal.Purchases, aes(x = Season, y = Total.Purchase, fill = Product.Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Subscription.Status) +
  labs(title = "Product Purchases by Season and Subscription Status",
       x = "Season",
       y = "Number of Purchases")

# Save cleaned data frame as a CSV file
write.csv(Sales.Data, file = "Cleaned_Sales_Data.csv", row.names = FALSE)