library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
data <- read_csv("large_sales_dataset.csv")
## Rows: 1000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Product_Category, Purchase_Date, Customer_Gender, Store_Location
## dbl (4): Customer_ID, Purchase_Amount, Customer_Age, Satisfaction_Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question 1: Data Analysis (8 points)

1a)(4 points) Output a table that has both the average and median Purchase_Amount grouped by Product_Category, Customer_Gender and Customer_Age. Show the last 15 rows of the table

data$Customer_Age <- cut(
  data$Customer_Age,
  breaks = c(0, 18, 25, 35, 45, 55, 65, 75, 85, 100),
  labels = c('0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76-85', '86-100'),
  right = TRUE,  # Includes the upper limit in the interval
  include.lowest = TRUE  # Includes 0 in the first bin
)
purchase_amount_table <- data %>%
  group_by(Product_Category, Customer_Gender, Customer_Age) %>%
  summarise(
    AvgPurchaseAmount = mean(Purchase_Amount, na.rm = TRUE),
    MedPurchaseAmount = median(Purchase_Amount, na.rm = TRUE),
    .groups = "drop"
  )
tail(purchase_amount_table, 15)
## # A tibble: 15 × 5
##    Product_Category Customer_Gender Customer_Age AvgPurchaseAmount
##    <chr>            <chr>           <fct>                    <dbl>
##  1 Home Appliances  Male            46-55                    1372.
##  2 Home Appliances  Male            56-65                    1109.
##  3 Home Appliances  Male            66-75                     895.
##  4 Toys             Female          19-25                    1013.
##  5 Toys             Female          26-35                    1027.
##  6 Toys             Female          36-45                     586.
##  7 Toys             Female          46-55                    1053.
##  8 Toys             Female          56-65                     948.
##  9 Toys             Female          66-75                     929.
## 10 Toys             Male            0-18                     1115.
## 11 Toys             Male            19-25                    1139.
## 12 Toys             Male            26-35                    1155.
## 13 Toys             Male            36-45                     932.
## 14 Toys             Male            46-55                     778.
## 15 Toys             Male            56-65                    1008.
## # ℹ 1 more variable: MedPurchaseAmount <dbl>

Which customer age group has the highest total purchase amount across all categories?

sum_purchases <- data %>%
  group_by(Customer_Age) %>%
  summarise(Total_Purchase = sum(Purchase_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Purchase))
sum_purchases
## # A tibble: 7 × 2
##   Customer_Age Total_Purchase
##   <fct>                 <dbl>
## 1 26-35               279965.
## 2 19-25               205891.
## 3 36-45               186527.
## 4 46-55               147993.
## 5 56-65               104559.
## 6 66-75                39057.
## 7 0-18                 28610.

-Age group 26-35 has the highest total purchase amount at $279,964.80.

Which product category has the highest average satisfaction score, and in which location is it most frequently purchased?

average_satisfaction <- data %>%
  group_by(Product_Category) %>%
  summarise(Average_Satisfaction = mean(Satisfaction_Score, na.rm = TRUE)) %>%
  arrange(desc(Average_Satisfaction))
average_satisfaction
## # A tibble: 6 × 2
##   Product_Category Average_Satisfaction
##   <chr>                           <dbl>
## 1 Electronics                      8.88
## 2 Home Appliances                  7.96
## 3 Furniture                        7.95
## 4 Books                            7.07
## 5 Clothing                         7.02
## 6 Toys                             6.02

-Electronics have the highest average satisfaction score at 8.884321.

electronics_sales <- data %>%
  filter(Product_Category == "Electronics")%>%
  group_by(Store_Location) %>%
  summarise(Total_Electronics_Sold = sum(Purchase_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Electronics_Sold))
electronics_sales
## # A tibble: 13 × 2
##    Store_Location Total_Electronics_Sold
##    <chr>                           <dbl>
##  1 Boston                         21590.
##  2 Miami                          21288.
##  3 San Diego                      19033.
##  4 Dallas                         17023.
##  5 Los Angeles                    12961.
##  6 San Francisco                  12445.
##  7 Atlanta                        11530.
##  8 Houston                        11287.
##  9 New York                       11265.
## 10 Chicago                         9790.
## 11 Las Vegas                       9330.
## 12 Seattle                         7408.
## 13 Denver                          3168.

-The Location with the highest sales of Electronics is Boston at $21589.74.

electronics_counts <- data %>%
  filter(Product_Category == "Electronics") %>%
  count(Store_Location, name = "Electronics_Purchase_Count") %>%  
  arrange(desc(Electronics_Purchase_Count))
electronics_counts
## # A tibble: 13 × 2
##    Store_Location Electronics_Purchase_Count
##    <chr>                               <int>
##  1 Boston                                 23
##  2 Miami                                  21
##  3 San Diego                              16
##  4 Chicago                                15
##  5 Dallas                                 15
##  6 Atlanta                                14
##  7 Houston                                14
##  8 Los Angeles                            12
##  9 New York                               12
## 10 Las Vegas                              11
## 11 San Francisco                           9
## 12 Seattle                                 9
## 13 Denver                                  5
  • Boston also has the highest total number of purchases involving electronics at 23.

Question 2: Time Series Analysis

Calculate the monthly average sales per product category and plot them.

data2 <- data
data2$Purchase_Date <- as.Date(data$Purchase_Date, format = "%m/%d/%Y")
data2 <- data2 %>%
  mutate(Month = floor_date(Purchase_Date, "month"))
monthly_avg_sales <- data2 %>%
  group_by(Month, Product_Category) %>%
  summarise(Average_Sales = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")
ggplot(monthly_avg_sales, aes(x = Month, y = Average_Sales, color = Product_Category)) +
  geom_line(size = 1.5) +
  labs(title = "Monthly Average Sales per Product Category",
       x = "Month",
       y = "Average Sales")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Calculate the rolling means of average monthly sales (calculated in 2a) using a 3 month window. Use the center alignment to calculate the rolling means.

rolling_avg_sales <- monthly_avg_sales %>%
  arrange(Product_Category, Month) %>%
  group_by(Product_Category) %>%
  mutate(Rolling_Avg_3_Month = rollmean(Average_Sales, k = 3, fill = NA, align = "center"))
ggplot(rolling_avg_sales, aes(x = Month)) +
  # Original monthly average sales (lighter line)
  geom_line(aes(y = Average_Sales), color = "gray60", size = 1) +
  
  # 3-month rolling average (highlighted line)
  geom_line(aes(y = Rolling_Avg_3_Month), color = "steelblue", size = 1.5) +
  
  facet_wrap(~ Product_Category, scales = "free_y") +
  labs(title = "Monthly vs. 3-Month Rolling Average Sales by Product Category",
       x = "Month",
       y = "Sales") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

-3-month averages are in SteelBlue, monthly averages are in Gray60. ### (i) What difference do you observe? How are the rolling means beneficial as compared to simple averages?

  • The differences I observe is that the 3-month rolling averages are much smoother in comparison to the monthly average sales of each product. They are much less reactive, less noisy, and are more indicative and, one could say predictive, for the following months sales. They would form a decent starting model in predicting the following months sales.

Question 3: Data Exploration

Create a boxplot of the response variable versus the following predicting variables:

(i) Store_Location

-I can’t find trainData_Sales…