1 Introduction

This analysis explores the Superstore Dataset to identify customer purchase patterns, sales trends, and correlations among various attributes. Visualization techniques, time series analysis, and word clouds are used to enhance insights.

2 Import the dataset

Supersport <- read_excel("C:/Users/DELL/Desktop/Class_materials/Sample - Superstore.xls", sheet = 1)
Supersport
## # A tibble: 9,994 × 21
##    `Row ID` `Order ID`     `Order Date`        `Ship Date`         `Ship Mode`  
##       <dbl> <chr>          <dttm>              <dttm>              <chr>        
##  1        1 CA-2016-152156 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Class 
##  2        2 CA-2016-152156 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Class 
##  3        3 CA-2016-138688 2016-06-12 00:00:00 2016-06-16 00:00:00 Second Class 
##  4        4 US-2015-108966 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard Cla…
##  5        5 US-2015-108966 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard Cla…
##  6        6 CA-2014-115812 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard Cla…
##  7        7 CA-2014-115812 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard Cla…
##  8        8 CA-2014-115812 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard Cla…
##  9        9 CA-2014-115812 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard Cla…
## 10       10 CA-2014-115812 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard Cla…
## # ℹ 9,984 more rows
## # ℹ 16 more variables: `Customer ID` <chr>, `Customer Name` <chr>,
## #   Segment <chr>, Country <chr>, City <chr>, State <chr>, `Postal Code` <dbl>,
## #   Region <chr>, `Product ID` <chr>, Category <chr>, `Sub-Category` <chr>,
## #   `Product Name` <chr>, Sales <dbl>, Quantity <dbl>, Discount <dbl>,
## #   Profit <dbl>

2.1 Sales Analysis

2.1.1 Selecting Relevant Columns

The analysis focuses on the key columns related to sales, discounts, and profits.

SUPER <- Supersport %>% select(`Order Date`, Sales, Quantity, Discount, Profit) 

2.2 Customer Purchase Analysis

2.2.1 Top Customers by Purchase Count

This section identifies the top 20 customers by the number of purchases and visualizes the results with a bar chart.

customer_purchases <- Supersport %>%   group_by(`Customer ID`) %>%   summarize(Purchase_Count = n()) %>%   arrange(desc(Purchase_Count))  

3 Top 20 customers by purchase count

 customer_purchases_T <- customer_purchases %>%   slice_max(order_by = Purchase_Count, n = 20)  

4 Bar chart for customer purchases

ggplot(customer_purchases_T, aes(x = reorder(`Customer ID`, Purchase_Count), y = Purchase_Count)) +   geom_bar(stat = "identity", fill = "steelblue") +   coord_flip() +   labs(title = "Customer Purchase Counts", x = "Customer ID", y = "Number of Purchases") +   theme_minimal() +   theme(     plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),     axis.title.x = element_text(size = 12),     axis.title.y = element_text(size = 12),     axis.text = element_text(size = 10),     panel.grid.major = element_blank(),     panel.grid.minor = element_blank()   ) 

4.0.1 Word Cloud for Customer Purchases

A word cloud visualization shows customer purchase behavior.

customer_purchases %>%   with(wordcloud(`Customer ID`, Purchase_Count, min.freq = 2, random.order = FALSE,                  scale = c(3, 0.5), max.words = Inf, rot.per = 0.35,                  colors = brewer.pal(8, "Dark2"), fixed.asp = TRUE)) 

4.1 Sales by Ship Mode

4.1.1 Sales and Profit by Ship Mode

This section summarizes total sales and profits by ship mode and visualizes the results.

ship_mode <- Supersport %>%   group_by(`Ship Mode`) %>%   summarize(     sum_sales = sum(Sales),     sum_profit = sum(Profit),     sum_quantity = sum(Quantity),     sum_discount = sum(Discount)   ) 

4.1.2

customer_purchases_name <- Supersport%>%
  group_by(`Customer Name`) %>%
  summarize(Purchase_Count = n())%>%arrange(desc(Purchase_Count))
par(mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0))
customer_purchases_name%>%with(wordcloud(`Customer Name`, Purchase_Count, min.freq = 2, random.order = FALSE, scale = c(3, 0.5),max.words = Inf,rot.per = 0.35,colors = brewer.pal(8, "Dark2"), fixed.asp = TRUE))

5

customer_purchases_ID <- Supersport%>% group_by(`Product ID`) %>% summarize(Purchase_Count = n())%>%arrange(desc(Purchase_Count))
par(mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0))

customer_purchases_ID%>%with(wordcloud(`Product ID`, Purchase_Count, min.freq = 2, random.order = FALSE, scale = c(3, 0.5),max.words = Inf,rot.per = 0.35,colors = brewer.pal(8, "Dark2"), fixed.asp = TRUE))

customer_purchases_PR <- Supersport%>% group_by(`Product Name`) %>% summarize(Purchase_Count = n())
par(mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0))

customer_purchases_PR%>%with(wordcloud(`Product Name`, Purchase_Count, min.freq = 2, random.order = FALSE, scale = c(3, 0.5),max.words = Inf,rot.per = 0.35,colors = brewer.pal(8, "Dark2"), fixed.asp = TRUE))

6 Correlation analysis

cor.test(Supersport$Sales, Supersport$Quantity) 
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Sales and Supersport$Quantity
## t = 20.489, df = 9992, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1819049 0.2195365
## sample estimates:
##       cor 
## 0.2007948
cor.test(Supersport$Sales, Supersport$Discount) 
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Sales and Supersport$Discount
## t = -2.819, df = 9992, p-value = 0.004827
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.047769675 -0.008588918
## sample estimates:
##         cor 
## -0.02819012
cor.test(Supersport$Sales, Supersport$Profit) 
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Sales and Supersport$Profit
## t = 54.555, df = 9992, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4638148 0.4940301
## sample estimates:
##       cor 
## 0.4790643
cor.test(Supersport$Profit, Supersport$Discount) 
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Profit and Supersport$Discount
## t = -22.488, df = 9992, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2380689 -0.2007454
## sample estimates:
##        cor 
## -0.2194875
cor.test(Supersport$Profit, Supersport$Quantity)
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Profit and Supersport$Quantity
## t = 6.6373, df = 9992, p-value = 3.362e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04670791 0.08574776
## sample estimates:
##        cor 
## 0.06625319
cor.test(Supersport$Discount, Supersport$Quantity) 
## 
##  Pearson's product-moment correlation
## 
## data:  Supersport$Discount and Supersport$Quantity
## t = 0.86198, df = 9992, p-value = 0.3887
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01098484  0.02822415
## sample estimates:
##        cor 
## 0.00862297
dfSummary(Supersport%>%select(where(is.numeric)))
## Data Frame Summary  
## Supersport  
## Dimensions: 9994 x 6  
## Duplicates: 0  
## 
## --------------------------------------------------------------------------------------------------------------------
## No   Variable      Stats / Values                  Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- ------------- ------------------------------- ---------------------- --------------------- ---------- ---------
## 1    Row ID        Mean (sd) : 4997.5 (2885.2)     9994 distinct values   : : : : : : : : : :   9994       0        
##      [numeric]     min < med < max:                                       : : : : : : : : : :   (100.0%)   (0.0%)   
##                    1 < 4997.5 < 9994                                      : : : : : : : : : :                       
##                    IQR (CV) : 4996.5 (0.6)                                : : : : : : : : : :                       
##                                                                           : : : : : : : : : :                       
## 
## 2    Postal Code   Mean (sd) : 55190.4 (32063.7)   631 distinct values                      :   9994       0        
##      [numeric]     min < med < max:                                                         :   (100.0%)   (0.0%)   
##                    1040 < 56430.5 < 99301                                 .                 :                       
##                    IQR (CV) : 66785 (0.6)                                 : . . . :     :   :                       
##                                                                           : : : : : . : : : :                       
## 
## 3    Sales         Mean (sd) : 229.9 (623.2)       5825 distinct values   :                     9994       0        
##      [numeric]     min < med < max:                                       :                     (100.0%)   (0.0%)   
##                    0.4 < 54.5 < 22638.5                                   :                                         
##                    IQR (CV) : 192.7 (2.7)                                 :                                         
##                                                                           :                                         
## 
## 4    Quantity      Mean (sd) : 3.8 (2.2)           14 distinct values     :                     9994       0        
##      [numeric]     min < med < max:                                       : .                   (100.0%)   (0.0%)   
##                    1 < 3 < 14                                             : :   .                                   
##                    IQR (CV) : 3 (0.6)                                     : : : :                                   
##                                                                           : : : : : . .                             
## 
## 5    Discount      Mean (sd) : 0.2 (0.2)           12 distinct values     :                     9994       0        
##      [numeric]     min < med < max:                                       :   :                 (100.0%)   (0.0%)   
##                    0 < 0.2 < 0.8                                          :   :                                     
##                    IQR (CV) : 0.2 (1.3)                                   :   :                                     
##                                                                           :   : .         . .                       
## 
## 6    Profit        Mean (sd) : 28.7 (234.3)        7314 distinct values           :             9994       0        
##      [numeric]     min < med < max:                                               :             (100.0%)   (0.0%)   
##                    -6600 < 8.7 < 8400                                             :                                 
##                    IQR (CV) : 27.6 (8.2)                                          :                                 
##                                                                                   :                                 
## --------------------------------------------------------------------------------------------------------------------
summary(Supersport%>%select(where(is.numeric))) 
##      Row ID      Postal Code        Sales              Quantity    
##  Min.   :   1   Min.   : 1040   Min.   :    0.444   Min.   : 1.00  
##  1st Qu.:2499   1st Qu.:23223   1st Qu.:   17.280   1st Qu.: 2.00  
##  Median :4998   Median :56431   Median :   54.490   Median : 3.00  
##  Mean   :4998   Mean   :55190   Mean   :  229.858   Mean   : 3.79  
##  3rd Qu.:7496   3rd Qu.:90008   3rd Qu.:  209.940   3rd Qu.: 5.00  
##  Max.   :9994   Max.   :99301   Max.   :22638.480   Max.   :14.00  
##     Discount          Profit         
##  Min.   :0.0000   Min.   :-6599.978  
##  1st Qu.:0.0000   1st Qu.:    1.729  
##  Median :0.2000   Median :    8.666  
##  Mean   :0.1562   Mean   :   28.657  
##  3rd Qu.:0.2000   3rd Qu.:   29.364  
##  Max.   :0.8000   Max.   : 8399.976
sum(Supersport$Sales)
## [1] 2297201
sum(Supersport$Profit) 
## [1] 286397
sum(Supersport$Quantity)
## [1] 37873
sum(Supersport$Discount)
## [1] 1561.09
ship_mode <- Supersport%>%group_by(`Ship Mode`)%>%summarize(sum_sales = sum(Sales), sum_profit=sum(Profit), sum_quantity = sum(Quantity), sum_discount=sum(Discount))

7 Bar chart for sales by ship mode

ggplot(ship_mode, aes(x = `Ship Mode`, y = sum_sales)) + geom_bar(stat = "identity", fill = "lightblue", color = "black") + labs(title = "Total Sales by Ship Mode", x = " ", y = " ",  x = "Ship Mode", y = "Total Sales") + theme_minimal() + theme( axis.text.x = element_text(face = "bold"),  axis.text.y = element_text(face = "bold"),  axis.title = element_text(face = "bold"),  plot.title = element_text(face = "bold")) 

7.1 Time Series Analysis

7.1.1 Time Series Forecasting with Prophet

Using Prophet for time series forecasting based on sales data.

library(prophet)
df <- Supersport   
daily_sales <- df %>%   group_by(`Order Date`) %>%   summarize(Sales = sum(Sales))
daily_sales <- daily_sales %>%   rename(ds = `Order Date`, y = Sales) 
model <- prophet(daily_sales)

# Create future dates for forecasting

future <- make_future_dataframe(model, periods = 365) 
forecast <- predict(model, future) 
plot(model, forecast) 

prophet_plot_components(model, forecast)  

8 ARIMA AND SARIMA

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
daily_sales <- df %>% group_by(`Order Date`) %>% summarize(Sales = sum(Sales))

ts_sales <- ts(daily_sales$Sales, frequency = 365) 
fit <- auto.arima(ts_sales)

forecast_arima <- forecast(fit, h = 365)

plot(forecast_arima)

9 Fit the SARIMA model

fit_sarima <- auto.arima(ts_sales, seasonal = TRUE)

forecast_sarima <- forecast(fit_sarima, h = 365) # Forecast for the next year

plot(forecast_sarima)

9.1 Conclusion

This analysis provided insights into the Superstore dataset, focusing on sales trends, customer purchase patterns, and time series forecasting using Prophet and ARIMA models.