library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.1
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data <- read_csv("large_sales_dataset.csv")
## Rows: 1000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Product_Category, Purchase_Date, Customer_Gender, Store_Location
## dbl (4): Customer_ID, Purchase_Amount, Customer_Age, Satisfaction_Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$Customer_Age <- cut(
data$Customer_Age,
breaks = c(0, 18, 25, 35, 45, 55, 65, 75, 85, 100),
labels = c('0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76-85', '86-100'),
right = TRUE, # Includes the upper limit in the interval
include.lowest = TRUE # Includes 0 in the first bin
)
purchase_amount_table <- data %>%
group_by(Product_Category, Customer_Gender, Customer_Age) %>%
summarise(
AvgPurchaseAmount = mean(Purchase_Amount, na.rm = TRUE),
MedPurchaseAmount = median(Purchase_Amount, na.rm = TRUE),
.groups = "drop"
)
tail(purchase_amount_table, 15)
## # A tibble: 15 × 5
## Product_Category Customer_Gender Customer_Age AvgPurchaseAmount
## <chr> <chr> <fct> <dbl>
## 1 Home Appliances Male 46-55 1372.
## 2 Home Appliances Male 56-65 1109.
## 3 Home Appliances Male 66-75 895.
## 4 Toys Female 19-25 1013.
## 5 Toys Female 26-35 1027.
## 6 Toys Female 36-45 586.
## 7 Toys Female 46-55 1053.
## 8 Toys Female 56-65 948.
## 9 Toys Female 66-75 929.
## 10 Toys Male 0-18 1115.
## 11 Toys Male 19-25 1139.
## 12 Toys Male 26-35 1155.
## 13 Toys Male 36-45 932.
## 14 Toys Male 46-55 778.
## 15 Toys Male 56-65 1008.
## # ℹ 1 more variable: MedPurchaseAmount <dbl>
sum_purchases <- data %>%
group_by(Customer_Age) %>%
summarise(Total_Purchase = sum(Purchase_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Purchase))
sum_purchases
## # A tibble: 7 × 2
## Customer_Age Total_Purchase
## <fct> <dbl>
## 1 26-35 279965.
## 2 19-25 205891.
## 3 36-45 186527.
## 4 46-55 147993.
## 5 56-65 104559.
## 6 66-75 39057.
## 7 0-18 28610.
-Age group 26-35 has the highest total purchase amount at $279,964.80.
average_satisfaction <- data %>%
group_by(Product_Category) %>%
summarise(Average_Satisfaction = mean(Satisfaction_Score, na.rm = TRUE)) %>%
arrange(desc(Average_Satisfaction))
average_satisfaction
## # A tibble: 6 × 2
## Product_Category Average_Satisfaction
## <chr> <dbl>
## 1 Electronics 8.88
## 2 Home Appliances 7.96
## 3 Furniture 7.95
## 4 Books 7.07
## 5 Clothing 7.02
## 6 Toys 6.02
-Electronics have the highest average satisfaction score at 8.884321.
electronics_sales <- data %>%
filter(Product_Category == "Electronics")%>%
group_by(Store_Location) %>%
summarise(Total_Electronics_Sold = sum(Purchase_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Electronics_Sold))
electronics_sales
## # A tibble: 13 × 2
## Store_Location Total_Electronics_Sold
## <chr> <dbl>
## 1 Boston 21590.
## 2 Miami 21288.
## 3 San Diego 19033.
## 4 Dallas 17023.
## 5 Los Angeles 12961.
## 6 San Francisco 12445.
## 7 Atlanta 11530.
## 8 Houston 11287.
## 9 New York 11265.
## 10 Chicago 9790.
## 11 Las Vegas 9330.
## 12 Seattle 7408.
## 13 Denver 3168.
-The Location with the highest sales of Electronics is Boston at $21589.74.
electronics_counts <- data %>%
filter(Product_Category == "Electronics") %>%
count(Store_Location, name = "Electronics_Purchase_Count") %>%
arrange(desc(Electronics_Purchase_Count))
electronics_counts
## # A tibble: 13 × 2
## Store_Location Electronics_Purchase_Count
## <chr> <int>
## 1 Boston 23
## 2 Miami 21
## 3 San Diego 16
## 4 Chicago 15
## 5 Dallas 15
## 6 Atlanta 14
## 7 Houston 14
## 8 Los Angeles 12
## 9 New York 12
## 10 Las Vegas 11
## 11 San Francisco 9
## 12 Seattle 9
## 13 Denver 5
data2 <- data
data2$Purchase_Date <- as.Date(data$Purchase_Date, format = "%m/%d/%Y")
data2 <- data2 %>%
mutate(Month = floor_date(Purchase_Date, "month"))
monthly_avg_sales <- data2 %>%
group_by(Month, Product_Category) %>%
summarise(Average_Sales = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")
ggplot(monthly_avg_sales, aes(x = Month, y = Average_Sales, color = Product_Category)) +
geom_line(size = 1.5) +
labs(title = "Monthly Average Sales per Product Category",
x = "Month",
y = "Average Sales")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
rolling_avg_sales <- monthly_avg_sales %>%
arrange(Product_Category, Month) %>%
group_by(Product_Category) %>%
mutate(Rolling_Avg_3_Month = rollmean(Average_Sales, k = 3, fill = NA, align = "center"))
ggplot(rolling_avg_sales, aes(x = Month)) +
# Original monthly average sales (lighter line)
geom_line(aes(y = Average_Sales), color = "gray60", size = 1) +
# 3-month rolling average (highlighted line)
geom_line(aes(y = Rolling_Avg_3_Month), color = "steelblue", size = 1.5) +
facet_wrap(~ Product_Category, scales = "free_y") +
labs(title = "Monthly vs. 3-Month Rolling Average Sales by Product Category",
x = "Month",
y = "Sales") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
-3-month averages are in SteelBlue, monthly averages are in Gray60. ### (i) What difference do you observe? How are the rolling means beneficial as compared to simple averages?
-I can’t find trainData_Sales…