Notebook created by Leslie A. McFarlin, 26 Feb 2022
# Libraries - installations and loading done via Packges pane.
# Uncomment the line below to load the libraries
install.packages('caTools', repos = "https://cran.rstudio.org")
library(readxl, plyr, dplyr, ggplot2, tibble, tidyverse, gt, corrplot, caTools, MLmetrics, knitr)
# Import data set, create data frame, view data
walmart <- data.frame(read.csv("Walmart_Store_sales.csv"), stringsAsFactors = T)
View(walmart)
There are 5 basic tasks to perform using this data set: - Identify the store with maximum sales. - Identify the store with the greatest standard deviation, and its coefficient of mean to standard deviation. - Identify which store(s) showed good quarterly growth rate in Q3’2012. - Identify the holidays showing a higher sales than the mean sales of non-holiday stores in one view. - Provide monthly and semester view of sales in units along with insights.
# Check for missing values
walmart[!complete.cases(walmart)] # There does not appear to be any missing data
# Identify the stores
stores <- as.factor(walmart$Store)
store_count <- length(stores) # 45 stores
# Identify the weeks
weeks <- as.factor(walmart$Date)
week_count <- length(weeks) # 143 weeks
# Summary of sales by store
sales_summary <- data.frame(aggregate(walmart$Weekly_Sales,list(walmart$Store), FUN = sum))
View(sales_summary) # Views the returned data frame
max_sales_val <- max(sales_summary$x) # Highest sales: $301,397,792.46
max_sales_store <- sales_summary[sales_summary$x == max_sales_val, 1] # The store with highest sales, #20
# Plot the sales summary per store across the entire data set
sales_summary_bar <- ggplot(sales_summary, aes(x = factor(Group.1), y = x)) +
geom_col(fill = '#440154') +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
labs(title = 'Yearly Sales', x = 'Store Number', y = 'Sales (in $)')
sales_summary_bar
# Std dev per store
store_stdev <- data.frame(aggregate(walmart$Weekly_Sales, list(walmart$Store), FUN = sd))
View(store_stdev) # Views the returned data frame
# Coefficient of mean to standard deviation per store
store_mean <- (tapply(walmart$Weekly_Sales, stores, mean))
store_mean_coeff <- numeric()
for (i in 1:length(store_mean)) {
store_mean_coeff[i] <- store_mean[i]/store_stdev$x[i]
}
store_stdev$store_mean_coeff <- store_mean_coeff
store_stdev$avg <- store_mean
# Do a final merge of dataframes sales_summary and store_stdev
sales_sum_final <- merge(sales_summary, store_stdev, by = "Group.1") # Group.1 is store number, x.x = sum of weekly sales per store, x.y = std dev of all weekly sales per store
View(sales_sum_final)
# Rename columns
sales_sum_final <- plyr::rename(sales_sum_final, c('Group.1' = 'id', 'x.x' = 'sales_sum', 'x.y' = 'std_dev'))
# Find store with the greatest standard deviation and coefficient of the mean to std dev
max_std_dev <- max(sales_sum_final$std_dev) # 317569.95
max_stdev_store <- sales_sum_final[sales_sum_final$std_dev == max_std_dev, 1] # Store 14
max_mean_coef <- max(sales_sum_final$store_mean_coeff) # 23.76
max_mc_store <- sales_sum_final[sales_sum_final$store_mean_coeff == max_mean_coef, 1] # Store 37
## Quarters of the year ##
# Q1: Jan, Feb, Mar #
# Q2: Apr, May, Jun #
# Q3: Jul, Aug, Sept #
# Q4: Oct, Nov, Dec #
##########################
q1 <- c('01', '02', '03')
q2 <- c('04','05','06')
q3 <- c('07','08','09')
q4 <- c('10', '11', '12')
# Create new column in walmart dataframe
quarters <- character()
# Iterate through the date column
for (i in 1:length(walmart$Date)) {
# Grab the date string
this_date <- unlist(strsplit(walmart$Date[i], "-"))
# Mark as Q1
if(this_date[2] %in% q1) {
quarters[i] <- 'Q1'
}
# Mark as Q2
else if(this_date[2] %in% q2) {
quarters[i] <- 'Q2'
}
# Mark as Q3
else if(this_date[2] %in% q3) {
quarters[i] <- 'Q3'
}
# Mark as Q4
else {
quarters[i] <- 'Q4'
}
}
# Add new column to walmart data frame
walmart$Quarter <- quarters
# Get summary of fiscal quarters
quarter_counts <- summary(as.factor(walmart$Quarter))
# Subset data for Q2 and Q3
# Q2
q2_weeklies <- subset(walmart,(substr(Date, 7, 10) %in% c("2012")) & (Quarter == 'Q2'), select = c(Store, Weekly_Sales))
# Q3
q3_weeklies <- subset(walmart,(substr(Date, 7, 10) %in% c("2012")) & (Quarter == 'Q3'), select = c(Store, Weekly_Sales))
# Combine split weekly data frames
q3_growth <- cbind(q2_weeklies, q3_weeklies$Weekly_Sales)
# Rename columns for clarity
q3_growth <- plyr::rename(q3_growth, c('Weekly_Sales' = 'q2_weekly_sales', 'q3_weeklies$Weekly_Sales' = 'q3_weekly_sales'))
# Create the growth comparison data frame
q3_growth_comparison <- data.frame(aggregate(cbind(q2_weekly_sales,q3_weekly_sales) ~ Store, data = q3_growth, FUN = sum))
# Comparison column
q3_growth_comparison$Q3_Growth_Rate <- ((q3_growth_comparison$q3_weekly_sales - q3_growth_comparison$q2_weekly_sales)/q3_growth_comparison$q2_weekly_sales)
# View
View(q3_growth_comparison)
# Which stores had a positive Q3'2012 growth rate
q3_pos_growth <- data.frame(subset(q3_growth_comparison, q3_growth_comparison$Q3_Growth_Rate > 0, select = c(Q3_Growth_Rate))) # 10 stores: 7, 16, 23, 24, 26, 35, 39, 40, 41, 44
# Line chart
q3_growth_chart <- ggplot(q3_growth_comparison, aes(x = factor(Store), y = Q3_Growth_Rate, group = 1)) +
geom_line()+
geom_point(aes(color = Q3_Growth_Rate > 0)) +
scale_color_manual(name = 'Growth Rate', values = setNames(c('green4', 'red3'), c(T, F)), labels = c('Positive', 'Negative')) +
labs(title = 'Q3 2012 Growth Rate by Store', x = 'Store Number', y = 'Growth Rate')
q3_growth_chart
# Create a new dataframe with the a holiday column
holiday <- data.frame(walmart[,c('Store', 'Date', 'Weekly_Sales', 'Holiday_Flag')])
View(holiday)
# Add an empty column to holiday
holiday[,'Holiday_Name'] <- NA
# Add a year column
holiday[, 'Year'] <- NA
# Add a month column
holiday[, 'Month'] <- NA
# Check against holiday flag + date column
# 1 = holiday, identify holiday based on month
# 0 = not a holiday
for (i in 1:length(holiday$Holiday_Flag)) {
# When a day is a holiday
if (holiday$Holiday_Flag[i] == 1) {
# Retrieve the date from Date
holi_date <- unlist(strsplit(holiday$Date[i], "-"))
# Focus on the month to find the specific holiday
holi_month <- holi_date[2]
# Superbowl
if (holi_month == '02') {
holiday$Holiday_Name[i] <- paste('Superbowl', holi_date[3])
}
# Labor Day
else if (holi_month == '09') {
holiday$Holiday_Name[i] <- paste('Labor Day', holi_date[3])
}
# Thanksgiving
else if (holi_month == '11') {
holiday$Holiday_Name[i] <- paste('Thanksgiving', holi_date[3])
}
# Christmas
else if (holi_month == '12') {
holiday$Holiday_Name[i] <- paste('Christmas', holi_date[3])
}
}
# When a day is not a holiday
else {
holiday$Holiday_Name[i] <- 'no holiday'
}
}
# Get year and month
for (i in 1:length(holiday$Date)) {
# Retrieve the date from Date
holi_date <- unlist(strsplit(holiday$Date[i], "-"))
# Get the year
holiday$Year[i] <- holi_date[3]
# Get the month
holiday$Month[i] <- holi_date[2]
}
##### ACROSS ALL YEARS, NON-HOLIDAY VS. HOLIDAY #####
# Create new dataframes for comparison
# No holidays
no_holidays <- data.frame(subset(holiday, Holiday_Name == 'no holiday', select = -c(Holiday_Flag)))
View(no_holidays)
# Aggregated averages
no_hol_avgs <- data.frame(aggregate(no_holidays$Weekly_Sales, list(no_holidays$Store), FUN = mean))
View(no_hol_avgs)
# Rename columns
no_hol_avgs <- plyr::rename(no_hol_avgs, c('Group.1' = 'Store', 'x' = 'Weekly_Avg_NonHoliday'))
## HOLIDAY DATA FRAMES ##
# Superbowl dataframes
superbowl_2010 <- data.frame(subset(holiday, Holiday_Name == 'Superbowl 2010', select = -c(Holiday_Flag)))
View(superbowl_2010)
superbowl_2011 <- data.frame(subset(holiday, Holiday_Name == 'Superbowl 2011', select = -c(Holiday_Flag)))
View(superbowl_2011)
superbowl_2012 <- data.frame(subset(holiday, Holiday_Name == 'Superbowl 2012', select = -c(Holiday_Flag)))
View(superbowl_2012)
# Labor Day dataframes
laborday_2010 <- data.frame(subset(holiday, Holiday_Name == 'Labor Day 2010', select = -c(Holiday_Flag)))
View(laborday_2010)
laborday_2011 <- data.frame(subset(holiday, Holiday_Name == 'Labor Day 2011', select = -c(Holiday_Flag)))
View(laborday_2011)
laborday_2012 <- data.frame(subset(holiday, Holiday_Name == 'Labor Day 2012', select = -c(Holiday_Flag)))
View(laborday_2012)
# Thanksgiving dataframes
thanksgiving_2010 <- data.frame(subset(holiday, Holiday_Name == 'Thanksgiving 2010', select = -c(Holiday_Flag)))
View(thanksgiving_2010)
thanksgiving_2011 <- data.frame(subset(holiday, Holiday_Name == 'Thanksgiving 2011', select = -c(Holiday_Flag)))
View(thanksgiving_2011)
# Christmas dataframes
christmas_2010 <- data.frame(subset(holiday, Holiday_Name == 'Christmas 2010', select = -c(Holiday_Flag)))
View(christmas_2010)
christmas_2011 <- data.frame(subset(holiday, Holiday_Name == 'Christmas 2011', select = -c(Holiday_Flag)))
View(christmas_2011)
# Merge yearly holiday columns to no_hol_avgs dataframe
# Superbowl
no_hol_avgs$Superbowl_2010 <- superbowl_2010$Weekly_Sales
no_hol_avgs$Superbowl_2011 <- superbowl_2011$Weekly_Sales
no_hol_avgs$Superbowl_2012 <- superbowl_2012$Weekly_Sales
# Labor Day
no_hol_avgs$LaborDay_2010 <- laborday_2010$Weekly_Sales
no_hol_avgs$LaborDay_2011 <- laborday_2011$Weekly_Sales
no_hol_avgs$LaborDay_2012 <- laborday_2012$Weekly_Sales
# Thanksgiving
no_hol_avgs$Thanksgiving_2010 <- thanksgiving_2010$Weekly_Sales
no_hol_avgs$Thanksgiving_2011 <- thanksgiving_2011$Weekly_Sales
# Christmas
no_hol_avgs$Christmas_2010 <- christmas_2010$Weekly_Sales
no_hol_avgs$Christmas_2011 <- christmas_2011$Weekly_Sales
## COMPARISONS ##
# Superbowl
no_hol_avgs$superbowl_2010_comp <- ifelse(no_hol_avgs$Superbowl_2010 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$superbow_2011_comp <- ifelse(no_hol_avgs$Superbowl_2011 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$superbowl_2012_comp <- ifelse(no_hol_avgs$Superbowl_2012 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
# Labor Day
no_hol_avgs$laborday_2010_comp <- ifelse(no_hol_avgs$LaborDay_2010 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$laborday_2011_comp <- ifelse(no_hol_avgs$LaborDay_2011 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$laborday_2012_comp <- ifelse(no_hol_avgs$LaborDay_2012 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
# Thanksgiving
no_hol_avgs$thanksgiving_2010_comp <- ifelse(no_hol_avgs$Thanksgiving_2010 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$thanksgiving_2011_comp <- ifelse(no_hol_avgs$Thanksgiving_2011 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
# Christmas
no_hol_avgs$christmas_2010_comp <- ifelse(no_hol_avgs$Christmas_2010 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
no_hol_avgs$christmas_2011_comp <- ifelse(no_hol_avgs$Christmas_2011 > no_hol_avgs$Weekly_Avg_NonHoliday, "greater", "not greater")
##### WITHIN YEAR 2010: NON-HOLIDAY VS. HOLIDAY #####
nh_2010 <- data.frame(subset(no_hol_avgs, select = -c(2:22)))
# February 2010
nh_Feb2010 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2010' & Month == '02') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# September 2010
nh_Sept2010 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2010' & Month == '09') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# November 2010
nh_Nov2010 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2010' & Month == '11') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# December 2010
nh_Dec2010 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2010' & Month == '12') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# Bind columns
nh_2010 <- cbind(nh_2010, superbowl_2010$Weekly_Sales, nh_Feb2010$Mean, laborday_2010$Weekly_Sales, nh_Sept2010$Mean, thanksgiving_2010$Weekly_Sales, nh_Nov2010$Mean, christmas_2010$Weekly_Sales, nh_Dec2010$Mean)
# Add comparison columns
nh_2010$superbowl_comparison <- ifelse(superbowl_2010$Weekly_Sales > nh_Feb2010$Mean, "greater", "not greater") # Superbowl
nh_2010$laborday_comparison <- ifelse(laborday_2010$Weekly_Sales > nh_Sept2010$Mean, "greater", "not greater") # Labor Day
nh_2010$thanksgiving_comparison <- ifelse(thanksgiving_2010$Weekly_Sales > nh_Nov2010$Mean, "greater", "not greater") # Thanksgiving
nh_2010$christmas_comparison <- ifelse(christmas_2010$Weekly_Sales > nh_Dec2010$Mean, "greater", "not greater") # Christmas
# View dataframe
View(nh_2010)
##### WITHIN YEAR 2011: NON-HOLIDAY VS. HOLIDAY #####
nh_2011 <- data.frame(subset(no_hol_avgs, select = -c(2:22)))
# February 2011
nh_Feb2011 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2011' & Month == '02') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# September 2011
nh_Sept2011 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2011' & Month == '09') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# November 2011
nh_Nov2011 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2011' & Month == '11') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# December 2011
nh_Dec2011 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2011' & Month == '12') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# Bind columns
nh_2011 <- cbind(nh_2011, superbowl_2011$Weekly_Sales, nh_Feb2011$Mean, laborday_2011$Weekly_Sales, nh_Sept2011$Mean, thanksgiving_2011$Weekly_Sales, nh_Nov2011$Mean, christmas_2011$Weekly_Sales, nh_Dec2011$Mean)
# Add comparison columns
nh_2011$superbowl_comparison <- ifelse(superbowl_2011$Weekly_Sales > nh_Feb2011$Mean, "greater", "not greater") # Superbowl
nh_2011$laborday_comparison <- ifelse(laborday_2011$Weekly_Sales > nh_Sept2011$Mean, "greater", "not greater") # Labor Day
nh_2011$thanksgiving_comparison <- ifelse(thanksgiving_2011$Weekly_Sales > nh_Nov2011$Mean, "greater", "not greater") # Thanksgiving
nh_2011$christmas_comparison <- ifelse(christmas_2011$Weekly_Sales > nh_Dec2011$Mean, "greater", "not greater") # Christmas
# View dataframe
View(nh_2011)
##### WITHIN YEAR 2012: NON-HOLIDAY VS. HOLIDAY #####
nh_2012 <- data.frame(subset(no_hol_avgs, select = -c(2:22)))
# February 2011
nh_Feb2012 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2012' & Month == '02') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# September 2012
nh_Sept2012 <- holiday %>%
filter(Holiday_Flag == 0 & Year == '2012' & Month == '09') %>%
group_by(Store) %>%
summarise(Mean = mean(Weekly_Sales))
# Bind columns
nh_2012 <- cbind(nh_2012, superbowl_2012$Weekly_Sales, nh_Feb2012$Mean, laborday_2012$Weekly_Sales, nh_Sept2012$Mean)
# Add comparison columns
nh_2012$superbowl_comparison <- ifelse(superbowl_2012$Weekly_Sales > nh_Feb2012$Mean, "greater", "not greater") # Superbowl
nh_2012$laborday_comparison <- ifelse(laborday_2012$Weekly_Sales > nh_Sept2012$Mean, "greater", "not greater") # Labor Day
# View dataframe
View(nh_2012)
## 2010 Superbowl Comparison
# Convert data frame into long format
nh_2010sb.long <- nh_2010 %>%
select('Store','superbowl_2010$Weekly_Sales', 'nh_Feb2010$Mean') %>%
pivot_longer(-Store, names_to = 'sbvars2010', values_to = 'sbvals2010')
# Create the visualization
sb2010_grids <- ggplot(nh_2010sb.long, aes(x = factor(Store), y= sbvals2010, color = sbvars2010, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Superbowl 2010', 'Regular Monthly Average.')) +
labs(title = 'February Average Sales vs. Superbowl: 2010', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
sb2010_grids
# Create the gt table object
feb_avgs <- gt(tibble(nh_2010 %>% filter(nh_2010$`nh_Feb2010$Mean` < nh_2010$`superbowl_2010$Weekly_Sales`)))
# Format
feb_avgs <- feb_avgs %>% tab_header(title = 'Stores with Higher Superbowl Sales vs February Non-Holiday Average Sale',
subtitle = '2010') %>%
cols_hide(columns = c(4:13))
# Display
feb_avgs
## 2011 Superbowl Comparison
# Convert data frame into long format
nh_2011sb.long <- nh_2011 %>%
select('Store','superbowl_2011$Weekly_Sales', 'nh_Feb2011$Mean') %>%
pivot_longer(-Store, names_to = 'sbvars2011', values_to = 'sbvals2011')
# Create the visualization
sb2011_grids <- ggplot(nh_2011sb.long, aes(x = factor(Store), y= sbvals2011, color = sbvars2011, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Superbowl 2011', 'Regular Monthly Average.')) +
labs(title = 'February Average Sales vs. Superbowl: 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
sb2011_grids
# Create the gt table object
feb2011_avgs <- gt(tibble(nh_2011 %>% filter(nh_2011$`nh_Feb2011$Mean` < nh_2011$`superbowl_2011$Weekly_Sales`)))
# Format
feb2011_avgs <- feb2011_avgs %>% tab_header(title = 'Stores with Higher Superbowl Sales vs February Non-Holiday Average Sale',
subtitle = '2011') %>%
cols_hide(columns = c(4:13))
# Display
feb2011_avgs
## 2012 Superbowl Comparison
# Convert data frame into long format
nh_2012sb.long <- nh_2012 %>%
select('Store','superbowl_2012$Weekly_Sales', 'nh_Feb2012$Mean') %>%
pivot_longer(-Store, names_to = 'sbvars2012', values_to = 'sbvals2012')
# Create the visualization
sb2012_grids <- ggplot(nh_2012sb.long, aes(x = factor(Store), y= sbvals2012, color = sbvars2012, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Superbowl 2012', 'Regular Monthly Average.')) +
labs(title = 'February Average Sales vs. Superbowl: 2012', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
sb2012_grids
# Create the gt table object
feb2012_avgs <- gt(tibble(nh_2012 %>% filter(nh_2012$`nh_Feb2012$Mean` < nh_2012$`superbowl_2012$Weekly_Sales`)))
# Format
feb2012_avgs <- feb2012_avgs %>% tab_header(title = 'Stores with Higher Superbowl Sales vs February Non-Holiday Average Sale',
subtitle = '2012') %>%
cols_hide(columns = c(4:7))
# Display
feb2012_avgs
All non-holiday seasons vs. all superbowl years
# Convert dataframe into long format
no_hol_avgs.long <- no_hol_avgs %>%
select('Store','Weekly_Avg_NonHoliday', 'Superbowl_2010', 'Superbowl_2011', 'Superbowl_2012') %>%
pivot_longer(-Store, names_to = 'sales_vars', values_to = 'sales_vals')
# Create the visualization
sb_grids <- ggplot(no_hol_avgs.long, aes(x = factor(Store), y= sales_vals, color = sales_vars, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Superbowl 2010', 'Superbowl 2011', 'Superbowl 2012', 'Non-Holiday Weekly Avg.')) +
labs(title = 'Weekly Average Sales vs. Superbowl: 2010 thru 2012', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
sb_grids
## 2010 laborday Comparison
# Convert data frame into long format
nh_2010ld.long <- nh_2010 %>%
select('Store','laborday_2010$Weekly_Sales', 'nh_Sept2010$Mean') %>%
pivot_longer(-Store, names_to = 'ldvars2010', values_to = 'ldvals2010')
# Create the visualization
ld2010_grids <- ggplot(nh_2010ld.long, aes(x = factor(Store), y= ldvals2010, color = ldvars2010, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Labor Day 2010', 'Regular Monthly Average.')) +
labs(title = 'September Average Sales vs. Labor Day: 2010', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
ld2010_grids
# Create the gt table object
sept_avgs <- gt(tibble(nh_2010 %>% filter(nh_2010$`nh_Sept2010$Mean` < nh_2010$`laborday_2010$Weekly_Sales`)))
# Format
sept_avgs <- sept_avgs %>% tab_header(title = 'Stores with Higher Labor Day Sales vs Sept Non-Holiday Average Sale',
subtitle = '2010') %>%
cols_hide(columns = c(2:3,6:13))
# Display
sept_avgs
## 2011 laborday Comparison
# Convert data frame into long format
nh_2011ld.long <- nh_2011 %>%
select('Store','laborday_2011$Weekly_Sales', 'nh_Sept2011$Mean') %>%
pivot_longer(-Store, names_to = 'ldvars2011', values_to = 'ldvals2011')
# Create the visualization
ld2011_grids <- ggplot(nh_2011ld.long, aes(x = factor(Store), y= ldvals2011, color = ldvars2011, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Labor Day 2011', 'Regular Monthly Average.')) +
labs(title = 'September Average Sales vs. Labor Day: 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
ld2011_grids
# Create the gt table object
sept2011_avgs <- gt(tibble(nh_2011 %>% filter(nh_2011$`nh_Sept2011$Mean` < nh_2011$`laborday_2011$Weekly_Sales`)))
# Format
sept2011_avgs <- sept2011_avgs %>% tab_header(title = 'Stores with Higher Labor Day Sales vs Sept Non-Holiday Average Sale',
subtitle = '2011') %>%
cols_hide(columns = c(2:3,6:13))
# Display
sept2011_avgs
## 2012 laborday Comparison
# Convert data frame into long format
nh_2012ld.long <- nh_2012 %>%
select('Store','laborday_2012$Weekly_Sales', 'nh_Sept2012$Mean') %>%
pivot_longer(-Store, names_to = 'ldvars2012', values_to = 'ldvals2012')
# Create the visualization
ld2012_grids <- ggplot(nh_2012ld.long, aes(x = factor(Store), y= ldvals2012, color = ldvars2012, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Labor Day 2012', 'Regular Monthly Average.')) +
labs(title = 'September Average Sales vs. Labor Day: 2012', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
ld2012_grids
# Create the gt table object
sept2012_avgs <- gt(tibble(nh_2012 %>% filter(nh_2012$`nh_Sept2012$Mean` < nh_2012$`laborday_2012$Weekly_Sales`)))
# Format
sept2012_avgs <- sept2012_avgs %>% tab_header(title = 'Stores with Higher Labor Day Sales vs Sept Non-Holiday Average Sale',
subtitle = '2012') %>%
cols_hide(columns = c(2:3,6:7))
# Display
sept2012_avgs
All non-holiday vs. all Labor Day years
# Convert dataframe into long format
no_hol_avgs1.long <- no_hol_avgs %>%
select('Store','Weekly_Avg_NonHoliday', 'LaborDay_2010', 'LaborDay_2011', 'LaborDay_2012') %>%
pivot_longer(-Store, names_to = 'labor_vars', values_to = 'labor_vals')
# Create the visualization
labor_grids <- ggplot(no_hol_avgs1.long, aes(x = factor(Store), y= labor_vals, color = labor_vars, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Labor Day 2010', 'Labor Day 2011', 'Labor Day 2012', 'Non-Holiday Weekly Avg.')) +
labs(title = 'Weekly Average Sales vs. Labor Day: 2010 thru 2012', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
labor_grids
## 2010 Thanksgiving Comparison
# Convert data frame into long format
nh_2010t.long <- nh_2010 %>%
select('Store','thanksgiving_2010$Weekly_Sales', 'nh_Nov2010$Mean') %>%
pivot_longer(-Store, names_to = 'tvars2010', values_to = 'tvals2010')
# Create the visualization
t2010_grids <- ggplot(nh_2010t.long, aes(x = factor(Store), y= tvals2010, color = tvars2010, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Thanksgiving 2010', 'Regular Monthly Average.')) +
labs(title = 'November Average Sales vs. Thanksgiving: 2010', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
t2010_grids
# Create the gt table object
nov_avgs <- gt(tibble(nh_2010 %>% filter(nh_2010$`nh_Nov2010$Mean` < nh_2010$`thanksgiving_2010$Weekly_Sales`)))
# Format
nov_avgs <- nov_avgs %>% tab_header(title = 'Stores with Higher Thanksgiving Sales vs Nov Non-Holiday Average Sale',
subtitle = '2010') %>%
cols_hide(columns = c(2:5,8:13))
# Display
nov_avgs
## 2011 Thanksgiving Comparison
# Convert data frame into long format
nh_2011t.long <- nh_2011 %>%
select('Store','thanksgiving_2011$Weekly_Sales', 'nh_Nov2011$Mean') %>%
pivot_longer(-Store, names_to = 'tvars2011', values_to = 'tvals2011')
# Create the visualization
t2011_grids <- ggplot(nh_2011t.long, aes(x = factor(Store), y= tvals2011, color = tvars2011, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Thanksgiving 2011', 'Regular Monthly Average.')) +
labs(title = 'November Average Sales vs. Thanksgiving: 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
t2011_grids
# Create the gt table object
nov2011_avgs <- gt(tibble(nh_2011 %>% filter(nh_2011$`nh_Nov2011$Mean` < nh_2011$`thanksgiving_2011$Weekly_Sales`)))
# Format
nov2011_avgs <- nov2011_avgs %>% tab_header(title = 'Stores with Higher Thanksgiving Sales vs Nov Non-Holiday Average Sale',
subtitle = '2011') %>%
cols_hide(columns = c(2:5,8:13))
# Display
nov2011_avgs
All non-holiday vs. all Thanksgiving years
# Convert dataframe into long format
no_hol_avgs2.long <- no_hol_avgs %>%
select('Store','Weekly_Avg_NonHoliday', 'Thanksgiving_2010', 'Thanksgiving_2011') %>%
pivot_longer(-Store, names_to = 'turkey_vars', values_to = 'turkey_vals')
# Create the visualization
turkey_grids <- ggplot(no_hol_avgs2.long, aes(x = factor(Store), y= turkey_vals, color = turkey_vars, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Thanksgiving 2010', 'Thanksgiving 2011', 'Non-Holiday Weekly Avg.')) +
labs(title = 'Weekly Average Sales vs. Thanksgiving: 2010 thru 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
turkey_grids
## 2010 Christmas Comparison
# Convert data frame into long format
nh_2010c.long <- nh_2010 %>%
select('Store','christmas_2010$Weekly_Sales', 'nh_Dec2010$Mean') %>%
pivot_longer(-Store, names_to = 'cvars2010', values_to = 'cvals2010')
# Create the visualization
c2010_grids <- ggplot(nh_2010c.long, aes(x = factor(Store), y= cvals2010, color = cvars2010, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Christmas 2010', 'Regular Monthly Average.')) +
labs(title = 'December Average Sales vs. Christmas: 2010', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
c2010_grids
# Create the gt table object
dec_avgs <- gt(tibble(nh_2010 %>% filter(nh_2010$`nh_Dec2010$Mean` < nh_2010$`christmas_2010$Weekly_Sales`)))
# Format
dec_avgs <- dec_avgs %>% tab_header(title = 'Stores with Higher Christmas Sales vs Nov Non-Holiday Average Sale',
subtitle = '2010') %>%
cols_hide(columns = c(2:7,10:13))
# Display
dec_avgs
## 2011 Christmas Comparison
# Convert data frame into long format
nh_2011c.long <- nh_2011 %>%
select('Store','christmas_2011$Weekly_Sales', 'nh_Dec2011$Mean') %>%
pivot_longer(-Store, names_to = 'cvars2011', values_to = 'cvals2011')
# Create the visualization
c2011_grids <- ggplot(nh_2011c.long, aes(x = factor(Store), y= cvals2011, color = cvars2011, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Christmas 2011', 'Regular Monthly Average.')) +
labs(title = 'December Average Sales vs. Christmas: 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
c2011_grids
# Create the gt table object
dec2011_avgs <- gt(tibble(nh_2011 %>% filter(nh_2011$`nh_Dec2011$Mean` < nh_2011$`christmas_2011$Weekly_Sales`)))
# Format
dec2011_avgs <- dec2011_avgs %>% tab_header(title = 'Stores with Higher Christmas Sales vs Nov Non-Holiday Average Sale',
subtitle = '2011') %>%
cols_hide(columns = c(2:7,10:13))
# Display
dec2011_avgs
All non-holiday vs. all Christmas years
# Convert dataframe into long format
no_hol_avgs3.long <- no_hol_avgs %>%
select('Store','Weekly_Avg_NonHoliday', 'Christmas_2010', 'Christmas_2011') %>%
pivot_longer(-Store, names_to = 'xmas_vars', values_to = 'xmas_vals')
# Create the visualization
xmas_grids <- ggplot(no_hol_avgs3.long, aes(x = factor(Store), y= xmas_vals, color = xmas_vars, group = 1)) +
geom_line() +
geom_point() +
scale_color_viridis_d(labels = c('Christmas 2010', 'Christmas 2011', 'Non-Holiday Weekly Avg.')) +
labs(title = 'Weekly Average Sales vs. Christmas: 2010 thru 2011', x = 'Store', y = 'Sales Amount', color = 'Sales Event')
xmas_grids
# Set up a monthly view
monthly <- data.frame(walmart[, c('Store', 'Date', 'Weekly_Sales', 'Holiday_Flag')])
View(monthly)
# Create a month column
monthly[,'Month'] <- NA
# Create a year column
monthly[,'Year'] <- NA
# Create a semester column
monthly[,'Semester'] <- NA
sem1 <- c("01", "02", "03", "04", "05", "06") # Semester 1
sem2 <- c("07", "08", "09", "10", "11", "12") # Semester 2
for (i in 1:length(monthly$Store)) {
# Retrieve the date from Date
monthly_date <- unlist(strsplit(monthly$Date[i], "-"))
# Focus on the month
monthly$Month[i] <- switch(monthly_date[2],
"01" = "Jan",
"02" = "Feb",
"03" = "Mar",
"04" = "Apr",
"05" = "May",
"06" = "Jun",
"07" = "Jul",
"08" = "Aug",
"09" = "Sept",
"10" = "Oct",
"11" = "Nov",
"12" = "Dec")
# Set the year
monthly$Year[i] <- monthly_date[3]
# Set the semester
if(monthly_date[2] %in% sem1) {
monthly$Semester[i] <- 'Semester 1'
}
else if(monthly_date[2] %in% sem2) {
monthly$Semester[i] <- 'Semester 2'
}
else {
monthly$Semester[i] <- 'Invalid month'
}
}
# Aggregate by month and year
monthly_sums <- data.frame(aggregate(Weekly_Sales ~ Store + Year + Month, data = monthly, FUN = sum))
View(monthly_sums)
# Aggregate by semester
semester_sums <- data.frame(aggregate(Weekly_Sales ~ Store + Year + Semester, data = monthly, FUN = sum))
View(semester_sums)
# Monthly plots
monthly2010_bars <- ggplot(subset(monthly_sums, Year %in% 2010), aes(x = factor(Store), y = Weekly_Sales, fill = factor(Month, levels= c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')))) +
geom_col(position = "fill") +
scale_fill_viridis_d(limits = c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
labs(title = '2010 Monthly Sales per Store: Proportional View', x = 'Store Number', y = 'Monthly Sales', fill = 'Month')
# Show the plot
monthly2010_bars
# Monthly plots
monthly2011_bars <- ggplot(subset(monthly_sums, Year %in% 2011), aes(x = factor(Store), y = Weekly_Sales, fill = factor(Month, levels= c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')))) +
geom_col(position = "fill") +
scale_fill_viridis_d(limits = c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
labs(title = '2011 Monthly Sales per Store: Proportional View', x = 'Store Number', y = 'Monthly Sales', fill = 'Month')
# Show the plot
monthly2011_bars
# Monthly plots
monthly2012_bars <- ggplot(subset(monthly_sums, Year %in% 2012), aes(x = factor(Store), y = Weekly_Sales, fill = factor(Month, levels= c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')))) +
geom_col(position = "fill") +
scale_fill_viridis_d(limits = c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
labs(title = '2012 Monthly Sales per Store: Proportional View', x = 'Store Number', y = 'Monthly Sales', fill = 'Month')
# Show the plot
monthly2012_bars
# Semester plot
sem_bars <- ggplot(semester_sums, aes(x = factor(Store), y = Weekly_Sales, fill = factor(Semester))) +
geom_col(position = "fill") +
scale_fill_viridis_d() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
labs(title = 'Semester Sales per Store: Proportional View', x = 'Store Number', y = 'Semester Sales', fill = 'Semester')
# Show the plot
sem_bars
For Store 1, build prediction models to forecast demand - Linear regression: - Restructure dates beginning at 1 for the earliest date in the order. - Change dates into days by creating a new variable. - Hypothesize if CPI, unemployment, and fuel price have any impact on sales.
# View data points by store
walmart_stores <- summary(as.factor(walmart$Store)) # Verified: 143 data points per store
# Create a new dataframe for the regression analysis
walmart_regression <- data.frame(subset(walmart, Store %in% 1))
View(walmart_regression)
# Date conversion
walmart_regression$day <- c(1:length(walmart_regression$Store))
# Drop unnecessary columns
walmart_regression <- subset(walmart_regression, select = -c(Store, Date, Holiday_Flag, Quarter))
# Scatter plot
par(mfrow = c(3,2)) # Formats the display
for (i in 2:6) {
plot(walmart_regression[,i],
walmart_regression$Weekly_Sales,
main = names(walmart_regression[i]),
ylab = names(walmart_regression$Weekly_Sales),
xlab = '', col = '#440154')
}
par(mfrow = c(1,1))
The heatmap below shows weak correlations between the Weekly Sales dependent variables and the independent variables of Temperature, Fuel_Price, CPI, and Unemployment. - Temperature and Unemployment have an inverse relationship with Weekly Sales. - Fuel_Price and CPI have a direct (positive) relationship with Weekly Sales.
Of concern in this model are the high correlations between Fuel_Price and CPI, Fuel_Price and Unemployment, and CPI and Unemployment. However, they do not appear to be statistically significant.
walmart_mcc <- cor(walmart_regression[, 1:6]) # Contains the correlation matrix
View(walmart_mcc)
corrplot(walmart_mcc, method = 'color', outline = T, cl.pos = 'n', rect.col = 'black', tl.col = 'indianred4', addCoef.col = "black", number.digits = 2, number.cex = 0.60, tl.cex = 0.7, cl.cex = 1, col = colorRampPalette(c("green4","white","red"))(100))
# Split data into train and test with and 80%-20% split
set.seed(1234) # Set the seed
sample_mvr1 <- sample.split(walmart_regression, SplitRatio = 0.80) #Set the split proportions
train <- subset(walmart_regression, sample_mvr1 == T) # Training data
test <- subset(walmart_regression, sample_mvr1 == F) # Test data
Uses day and assumes all predictor variables are continuous as well as the dependent variable. This analysis will use an alpha level of 0.05.
# Create the base model regression
model_base <- lm(formula = Weekly_Sales ~., data = train)
# View the base model summary
summary(model_base)
From the summary above, none of the included variables reach significance at the 0.05 level. However, the F-statistic being significant at the 0.05 alpha level indicates that all variables together yield a model that fits the data.
# Prediction
base_train_pred <- predict(model_base, newdata = train)
# Number of predictions
length(base_train_pred)
base_train_pred
# Visualization
base_training_chart <- ggplot() +
geom_point(aes(x = train$Weekly_Sales, y = base_train_pred)) +
labs(title = 'Weekly Sales [Base Model]: Training Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
base_training_chart
# Set up the model for the test data set
base_test_pred <- predict(model_base, newdata = test)
base_test_pred
# Visualization
base_test_chart <- ggplot() +
geom_point(aes(x = test$Weekly_Sales, y = base_test_pred)) +
labs(title = 'Weekly Sales [Base Model]: Test Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
base_test_chart
# Mean Absolute Percentage Error
mape_base <- MAPE(base_test_pred, test$Weekly_Sales)
# Root Mean Square Error
rmse_base <- RMSE(base_test_pred, test$Weekly_Sales)
print(paste('The base mean absolute percentage error is ', mape_base))
print(paste('The base root mean square error is ', rmse_base))
The mean absolute percentage error value is 5.9%. The MAPE standard for evaluating a MAPE value depends on the industry, so without knowing what the MAPE is for retail, it is hard to say if this model alone produces acceptable accuracy. However, it can serve as the baseline comparison against which other models will be judged.
The root mean square error serves as a way to evaluate the fit of a model to the data. It is best in a comparison role against other models, so the value of the base model here will serve as a baseline for other models.
Due to the nature of the Day variable, it will be removed from the next analysis to see if the model improves any.
Assuming all predictor variables are continuous as well as the dependent variable. For this analysis we will adopt an alpha level of 0.05.
# Remove day from train and test
train <- subset(train, select = -c(day))
test <- subset(test, select = -c(day))
# Set the linear regression formula
model_mvlin1 <- lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + CPI + Unemployment, data = train)
# View the model summary
summary(model_mvlin1)
From the summary above, it looks as if only CPI is a significant predictor of Weekly_Sales. However, the F-statistic is significant at the selected alpha level for the analysis, so the overall model can be accepted as a fit for the data.
# Prediction
weekly_sales_train_pred <- predict(model_mvlin1, newdata = train)
# Number of predictions
length(weekly_sales_train_pred)
weekly_sales_train_pred
# Visualization
training_chart <- ggplot() +
geom_point(aes(x = train$Weekly_Sales, y = weekly_sales_train_pred)) +
labs(title = 'Weekly Sales: Training Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
training_chart
# Set up the model for the test data set
weekly_sales_test_pred <- predict(model_mvlin1, newdata = test)
weekly_sales_test_pred
# Visualization
test_chart <- ggplot() +
geom_point(aes(x = test$Weekly_Sales, y = weekly_sales_test_pred)) +
labs(title = 'Weekly Sales: Test Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
test_chart
# Mean Absolute Percentage Error
mape1 <- MAPE(weekly_sales_test_pred, test$Weekly_Sales)
# Root Mean Square Error
rmse1 <- RMSE(weekly_sales_test_pred, test$Weekly_Sales)
print(paste('The mean absolute percentage error for model 1 is ', mape1))
print(paste('The root mean square error for model 1 is ', rmse1))
The mean absolute percentage error for this model is almost identical to the first model. Same with the root mean square error value. Therefore, despite the one statistically significant predictor variable, this model cannot be said to be better than the base model.
This final model will treat CPI as a continuous variable, but will remove Fuel_Price and Unemployment from the prediction model due to their results in the correlation matrix created at the start of the analysis.
# Set the linear regression formula
final_model <- lm(formula = Weekly_Sales ~ Temperature + CPI, data = train)
# View the model summary
summary(final_model)
This particular model shows both predictor variables as statistically significant. Furthermore, the model itself is statistically significant, indicating it is a good fit for the data.
# Prediction
final_pred <- predict(final_model, newdata = train)
# Number of predictions
length(final_pred)
final_pred
# Visualization
final_chart1 <- ggplot() +
geom_point(aes(x = train$Weekly_Sales, y = final_pred)) +
labs(title = 'Weekly Sales[Final Model]: Training Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
final_chart1
# Set up the model for the test data set
final_test_pred <- predict(final_model, newdata = test)
final_test_pred
# Visualization
final_test_chart <- ggplot() +
geom_point(aes(x = test$Weekly_Sales, y = final_test_pred)) +
labs(title = 'Weekly Sales [Final Model]: Test Set vs Predictions', x = 'Actual Weekly Sales', y = 'Predicted Weekly Sales')
final_test_chart
# Mean Absolute Percentage Error
final_mape <- MAPE(final_test_pred, test$Weekly_Sales)
# Root Mean Square Error
final_rmse <- RMSE(final_test_pred, test$Weekly_Sales)
print(paste('The mean absolute percentage error for the final model is ', final_mape))
print(paste('The root mean square error for the final model is ', final_rmse))
This final model offers a mean absolute percentage error of 5.6%, versus the 5.9% of the base model. Therefore, this model can be judged as more accurate. The root mean square error is also less for this model than the base model, so it is a better fit than the base model.
\(y_{pred} = -2500.2*temp + 9969.3*cpi - 431311.4\)