Load the library
library(tidyverse)
As well as the data. I turned the original .xlsx file into two separate .csv files (orders.csv and returns.csv) since I was having issues reading the .xlsx file in R. I have also attached the two .csv files to the email in case you run into issues with the R Markdown file.
my_data <- read_csv("C:\\Users\\A.Sharma\\Desktop\\Reports\\orders.csv")
my_data %>% filter(Region == "West") %>% summarise(AvgSalesWestCoast = mean(Sales))
## # A tibble: 1 x 1
## AvgSalesWestCoast
## <dbl>
## 1 226.
my_data %>% group_by(Category, Region) %>% count(name = "No. of Customers")
## # A tibble: 12 x 3
## # Groups: Category, Region [12]
## Category Region `No. of Customers`
## <chr> <chr> <int>
## 1 Furniture Central 481
## 2 Furniture East 601
## 3 Furniture South 332
## 4 Furniture West 707
## 5 Office Supplies Central 1422
## 6 Office Supplies East 1712
## 7 Office Supplies South 995
## 8 Office Supplies West 1897
## 9 Technology Central 420
## 10 Technology East 535
## 11 Technology South 293
## 12 Technology West 599
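Note that count() here tallies order lines per Category and Region rather than unique customers. If the intent is the number of distinct customers, a variant using n_distinct() could look like the sketch below (this assumes the orders data has a `Customer ID` column, as in the standard Superstore export):
# Distinct customers per Category/Region (hypothetical variant; assumes a `Customer ID` column)
my_data %>% group_by(Category, Region) %>%
  summarise(`No. of Customers` = n_distinct(`Customer ID`), .groups = "drop")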
NOTE: I based quarters on the order date and NOT the shipping date. I then added columns for year and quarter to my_data
my_data <- my_data %>%
  mutate(Year = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y"),
         Quarter = quarters(as.Date(`Order Date`, format = "%m/%d/%Y")))
Then, I made a dataframe that only had data from the 1st quarter of 2015 (Q1-2015)
Q1_2015 <- my_data %>% filter(Year == "2015", Quarter == "Q1")
Then, using the above dataframe, I made another dataframe to calculate the cumulative profit of each state in Q1-2015
StateProfit_Q12015 <- Q1_2015 %>% group_by(State) %>% summarise(CumProfit = sum(Profit, na.rm=TRUE))
StateProfit_Q12015
## # A tibble: 32 x 2
## State CumProfit
## <chr> <dbl>
## 1 Arizona -300.
## 2 Arkansas 437.
## 3 California 1610.
## 4 Colorado -6.55
## 5 Delaware 3.08
## 6 Florida -1557.
## 7 Georgia 71.2
## 8 Illinois -93.3
## 9 Indiana 35.9
## 10 Iowa 110.
## # ... with 22 more rows
Then I repeated the same process for the 1st quarter of 2016
Q1_2016 <- my_data %>% filter(Year == "2016", Quarter == "Q1")
StateProfit_Q12016 <- Q1_2016 %>% group_by(State) %>% summarise(CumProfit = sum(Profit, na.rm=TRUE))
StateProfit_Q12016
## # A tibble: 33 x 2
## State CumProfit
## <chr> <dbl>
## 1 Alabama 250.
## 2 Arkansas 3.00
## 3 California 2273.
## 4 Colorado 8.79
## 5 Delaware 777.
## 6 Florida 254.
## 7 Georgia 3259.
## 8 Illinois -841.
## 9 Indiana 76.7
## 10 Kansas 7.82
## # ... with 23 more rows
We can now find the relative profit increase for each state from Q1-2015 to Q1-2016 by merging the two dataframes shown above
Q1_1516 <- merge(StateProfit_Q12016, StateProfit_Q12015, by = "State", all = TRUE, suffixes = c("2016", "2015"))
and then computing the profit change (treating states with no sales in one of the quarters as having zero profit for that quarter) and arranging the data in descending order to find the state with the highest relative profit increase
Q1_1516State <- Q1_1516 %>%
  mutate(RelativeProf = coalesce(CumProfit2016, 0) - coalesce(CumProfit2015, 0)) %>%
  arrange(desc(RelativeProf))
Q1_1516State
## State CumProfit2016 CumProfit2015 RelativeProf
## 1 Georgia 3258.9156 71.2178 3187.6978
## 2 Washington 2944.0245 360.7111 2583.3134
## 3 Florida 254.0293 -1556.5105 1810.5398
## 4 New York 815.8103 -530.6330 1346.4433
## 5 Maryland 1296.1396 NA 1296.1396
## 6 Delaware 777.4231 3.0814 774.3417
## 7 Wisconsin 908.9740 206.3160 702.6580
## 8 California 2272.7948 1610.3104 662.4844
## 9 New Hampshire 339.5570 NA 339.5570
## 10 Arizona NA -300.1272 300.1272
## 11 Alabama 249.7311 NA 249.7311
## 12 Nebraska 62.0658 NA 62.0658
## 13 Texas -529.8081 -591.3309 61.5228
## 14 Tennessee 25.5744 -33.6438 59.2182
## 15 Indiana 76.6655 35.8524 40.8131
## 16 New Jersey 74.5654 49.6048 24.9606
## 17 Montana 20.8106 NA 20.8106
## 18 Colorado 8.7880 -6.5538 15.3418
## 19 Massachusetts 13.7400 NA 13.7400
## 20 Kansas 7.8221 NA 7.8221
## 21 New Mexico 23.0864 22.6782 0.4082
## 22 Oregon 2.5641 25.2757 -22.7116
## 23 Nevada NA 24.7712 -24.7712
## 24 Rhode Island -27.1642 NA -27.1642
## 25 Minnesota 74.3798 114.6568 -40.2770
## 26 Utah 5.7762 46.9632 -41.1870
## 27 Mississippi NA 42.8123 -42.8123
## 28 South Dakota NA 44.7684 -44.7684
## 29 South Carolina 14.7604 105.0954 -90.3350
## 30 Pennsylvania -77.1917 19.4213 -96.6130
## 31 Missouri NA 106.7911 -106.7911
## 32 Iowa NA 109.5012 -109.5012
## 33 Michigan 20.6336 205.9472 -185.3136
## 34 Louisiana NA 326.5672 -326.5672
## 35 Arkansas 2.9990 437.2475 -434.2485
## 36 Illinois -841.1898 -93.2520 -747.9378
## 37 Ohio -1565.6771 -327.2528 -1238.4243
## 38 Kentucky 131.0797 1567.4652 -1436.3855
## 39 Virginia 239.6253 1729.7589 -1490.1336
## 40 North Carolina -1616.3631 -16.2817 -1600.0814
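If "relative" is instead meant as a percentage change rather than an absolute difference, a variant restricted to states with sales in both quarters could look like the sketch below (not the approach used above):
# Percentage change in Q1 profit, only for states present in both years (hypothetical variant)
Q1_1516 %>%
  filter(!is.na(CumProfit2015), !is.na(CumProfit2016), CumProfit2015 != 0) %>%
  mutate(PctChange = (CumProfit2016 - CumProfit2015) / abs(CumProfit2015) * 100) %>%
  arrange(desc(PctChange))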
I begin by reading in the .csv file of all returned orders. Then, I keep only the unique Order IDs of the returned orders
my_data_returns <- read_csv("C:\\Users\\A.Sharma\\Desktop\\Reports\\returns.csv")
my_data_orderid <- unique(my_data_returns[c("Order ID")])
Then, for the sake of convenience and brevity, I make a new dataframe containing only the Region and Order ID columns from the orders data
my_data_region <- my_data %>% select(Region, `Order ID`)
Then, I merge the above two dataframes on Order ID and group by region to count the number of returns per region
Returned_Orders <- merge(x = my_data_region, y = my_data_orderid, by = "Order ID", all.y = TRUE) %>% group_by(Region) %>% count(name = "No.Of Returns")
Returned_Orders
## # A tibble: 5 x 2
## # Groups: Region [5]
## Region `No.Of Returns`
## <chr> <int>
## 1 Central 92
## 2 East 149
## 3 South 69
## 4 West 490
## 5 <NA> 1
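Because orders.csv has one row per line item, the counts above are returned line items rather than distinct orders; if distinct returned orders per region are wanted instead, a small variant (my own addition, not the original approach) would be:
# Distinct returned Order IDs per region instead of returned line items
merge(x = my_data_region, y = my_data_orderid, by = "Order ID", all.y = TRUE) %>%
  group_by(Region) %>%
  summarise(`No.Of Returned Orders` = n_distinct(`Order ID`))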
To see the average number of days between the order and shipping date per state, I first make a new column called Delay by subtracting the order date from the ship date, giving the delay in days for each order
Delay <- my_data %>%
  mutate(Delay = as.Date(`Ship Date`, format = "%m/%d/%Y") - as.Date(`Order Date`, format = "%m/%d/%Y"))
Grouping the above dataframe by state and taking the mean of Delay gives the average number of days between the order and shipping date per state, as follows
Delay %>% group_by(State) %>% summarise(mean(Delay))
## # A tibble: 49 x 2
## State `mean(Delay)`
## <chr> <drtn>
## 1 Alabama 4.114754 days
## 2 Arizona 4.071429 days
## 3 Arkansas 4.133333 days
## 4 California 3.867066 days
## 5 Colorado 3.681319 days
## 6 Connecticut 3.597561 days
## 7 Delaware 4.270833 days
## 8 District of Columbia 5.700000 days
## 9 Florida 3.947781 days
## 10 Georgia 3.836957 days
## # ... with 39 more rows
NOTE: To make a forecast of orders, I did a time-series analysis of the number of orders placed per month between 2015 and 2018. To do so, I first extract the year and the year/month from the order date column and add them to a new dataframe.
allsales <- my_data %>%
  mutate(MY = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y/%m"),
         Year = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y"))
Then, I take a count of the orders placed every month between 2015 and 2018
allmsales <- allsales %>% group_by(Year, MY) %>% count(name = "Orders")
Then, I add a time column to the monthly sales dataframe so that I can perform a time-series analysis
FixedMSales <- allmsales %>% rename(time = MY) %>% mutate(time = as.Date(parse_date(time, "%Y/%m"))) %>% na.omit()
I will now use the above dataframe to plot the monthly orders from 2015 to 2018, which look as follows, and to forecast them later on
ggplot(FixedMSales, aes(x = time, y = Orders)) +
geom_line(col = "hotpink") +
scale_x_date(date_labels = "%y %b", date_breaks = "2 month") +
theme_bw() + theme(legend.title = element_blank(),
axis.text.x = element_text(angle=45, vjust=0.5))
This plot shows that orders peak between August and October of each year, which makes sense since this is the end of summer break in the northern hemisphere as well as the start of the new academic year and the return to work in America; both could explain the spike. Orders fall to their lowest between January and February of each year, which can be explained by the end of winter break and the half-way point of the academic year.
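As a rough sanity check of this seasonal pattern, the monthly counts can also be pooled by calendar month across all years (a quick sketch of mine, not part of the original analysis):
# Average number of orders per calendar month, pooled across 2015-2018
FixedMSales %>%
  mutate(Month = format(time, "%b")) %>%
  group_by(Month) %>%
  summarise(AvgOrders = mean(Orders)) %>%
  arrange(desc(AvgOrders))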
To see whether the overall trend is rising or falling, I made a frequency plot and, as expected, noticed an overall rising trend throughout the years
ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"))) +
  geom_freqpoly() + xlab("Time") + ylab("Number of Orders")
Now, to start with the forecasting, we first need to load the forecast library
library(forecast)
The forecast package in R contains a very useful function called auto.arima which helps us select the best ARIMA model.
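For reference, fitting it once on the full monthly series would look roughly like this (a sketch; the model actually selected may differ):
# Fit an ARIMA model to the full series of monthly order counts and inspect it
arima_full <- auto.arima(as.ts(FixedMSales$Orders))
summary(arima_full)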
The first plot shows the autocorrelations. Each observation seems to be fairly correlated with the previous 2-3 observations.
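The code that produced that plot is not shown above; it can be reproduced with, for example, forecast's ggAcf() (an assumption on my part about how it was generated):
# Autocorrelation of the monthly order counts
ggAcf(FixedMSales$Orders) + theme_bw()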
I will use this model to predict one month at a time. In order to evaluate the performance of the model, we split the dataset into a training set (with, for example, the first 30 observations), fit the model (which will probably look like the one above) and then predict the 31st observation. Then we run the model on the first 31 observations, predict the 32nd, and so on. If this works, then we can be fairly confident that when we predict the 49th observation, we will not make a much larger error.
So first we split the dataset, allowing for a varying index:
train_index <- 30
n_total <- nrow(FixedMSales)
FixedMSales_train1 <- FixedMSales[1:(train_index),]
FixedMSales_test <- FixedMSales[(train_index+1):n_total,]
predicted <- numeric(n_total-train_index)
Then we run a loop that re-fits the model and forecasts one month ahead on each iteration
for (i in 1:(n_total-train_index)) {
FixedMSales_train <- FixedMSales[1:(train_index-1+i),]
arima_model <- auto.arima(as.ts(FixedMSales_train$Orders))
pred <- forecast(arima_model, 1)
predicted[i] <- pred$mean
}
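To put a number on how well these one-step-ahead forecasts track the held-out months, simple error measures can be computed (a small addition of mine, not part of the original write-up):
# Root mean squared error and mean absolute error of the one-step-ahead forecasts
rmse <- sqrt(mean((predicted - FixedMSales_test$Orders)^2))
mae <- mean(abs(predicted - FixedMSales_test$Orders))
c(RMSE = rmse, MAE = mae)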
Now, I will build a dataframe that contains the observed and predicted values for each month of each year
df_pred <- tibble(obs = c(FixedMSales_train1$Orders, FixedMSales_test$Orders),
predicted = c(FixedMSales_train1$Orders, predicted),
time = FixedMSales$time)
And finally, this dataframe is used to plot the observed and predicted number of orders placed every month between 2015 and 2018
ggplot(gather(df_pred, obs_pred, value, -time) %>%
mutate(obs_pred = factor(obs_pred, levels = c("predicted", "obs"))),
aes(x = time, y = value, col = obs_pred, linetype = obs_pred)) +
geom_line() +
xlab("") + ylab("") +
scale_color_manual(values=c("black", "hotpink")) +
scale_linetype_manual(values=c(2, 1)) +
scale_x_date(date_labels = "%Y %b", date_breaks = "2 month") +
theme_bw() + theme(legend.title = element_blank(),
axis.text.x = element_text(angle=45, vjust=0.5))
I have 7 points of attention to ensure data quality
Sales <- my_data %>% group_by(`Sub-Category`) %>% count() %>% arrange(desc(n))
Sales
## # A tibble: 17 x 2
## # Groups: Sub-Category [17]
## `Sub-Category` n
## <chr> <int>
## 1 Binders 1523
## 2 Paper 1370
## 3 Furnishings 957
## 4 Phones 889
## 5 Storage 846
## 6 Art 796
## 7 Accessories 775
## 8 Chairs 617
## 9 Appliances 466
## 10 Labels 364
## 11 Tables 319
## 12 Envelopes 254
## 13 Bookcases 228
## 14 Fasteners 217
## 15 Supplies 190
## 16 Machines 115
## 17 Copiers 68
As you can see, this business received 1,523 orders for binders over 4 years, which happens to be the highest number of orders placed for any sub-category.
ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"), color = `Sub-Category`)) +
  geom_freqpoly() + xlab("Time") + ylab("Number of Orders")
This graph in and of itself is quite descriptive; however, for colorblind people like me, it is an absolute nightmare! Hence, I made separate graphs for each sub-category for a neater overview
ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"), color = `Sub-Category`)) +
  geom_freqpoly() + facet_wrap(~ `Sub-Category`, nrow = 3) + xlab("Time") + ylab("Number of Orders")
From these graphs, we can conclude that even though binders are the best-selling item overall, paper sales are starting to gain popularity. This is potentially useful information for the online vendor, as they can increase stock levels and offer bundle prices on popular items to increase profits.
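One way to back this observation up would be to tabulate yearly order counts for the two largest sub-categories (a sketch of mine, not part of the original analysis):
# Yearly order counts for Binders and Paper, using the Year column added earlier
my_data %>%
  filter(`Sub-Category` %in% c("Binders", "Paper")) %>%
  count(Year, `Sub-Category`, name = "Orders")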
Discount Pricing: I would've liked to explore which sub-category and which segment got the most discount, and how the discounted price compares to the original price based on the number of items ordered. Ideally, I would make a graph showing the discounted price vs. the original price for each item, as well as a graph of how much discount each customer received. Unfortunately, I did not have enough time to explore this, but perhaps in the future!
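As a starting point for that exploration, and assuming the Sales column holds the post-discount amount (as in the standard Superstore data, which also carries a Discount column), the original price could be backed out roughly as Sales / (1 - Discount):
# Rough sketch: compare discounted and (approximate) original prices per sub-category
my_data %>%
  mutate(OriginalPrice = ifelse(Discount < 1, Sales / (1 - Discount), NA)) %>%
  group_by(`Sub-Category`) %>%
  summarise(AvgDiscount = mean(Discount),
            TotalSales = sum(Sales),
            TotalOriginalPrice = sum(OriginalPrice, na.rm = TRUE))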
Sales Prediction: I would've liked to forecast the orders per sub-category and per product for the near future, as this helps keep the business prepared for demand and/or supply shocks. Unfortunately, once again, I did not have time to go over this, but I would love to explore this avenue in the future!
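A rough way to attempt this with the tools already used above would be to build a monthly series per sub-category and fit auto.arima to each (a sketch only; the short, gappy series per sub-category may not forecast well):
# Sketch: 12-month order forecast per sub-category
monthly_subcat <- my_data %>%
  mutate(MY = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y/%m")) %>%
  count(`Sub-Category`, MY, name = "Orders") %>%
  arrange(`Sub-Category`, MY)
subcat_forecasts <- lapply(split(monthly_subcat, monthly_subcat$`Sub-Category`), function(d) {
  # months with zero orders are simply absent here, so this is only an approximation
  forecast(auto.arima(ts(d$Orders, frequency = 12)), h = 12)
})
subcat_forecasts[["Paper"]]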
I would like to have more data in the following fields:
Return Date: The returns sheet only has the Order ID of the returned item, but not when it was returned. I think it would be useful to have this information in order to see the average number of days between shipping and return.
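If such a column existed, the calculation would mirror the shipping-delay one above; with a hypothetical `Return Date` column in returns.csv (which the current data does not have) it might look like:
# Hypothetical: average days between shipping and return (requires a `Return Date` column that returns.csv currently lacks)
my_data %>%
  inner_join(my_data_returns, by = "Order ID") %>%
  mutate(ReturnDelay = as.Date(`Return Date`, format = "%m/%d/%Y") -
           as.Date(`Ship Date`, format = "%m/%d/%Y")) %>%
  summarise(AvgReturnDelay = mean(ReturnDelay, na.rm = TRUE))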
Feedback on Orders: The e-commerce business should send out a feedback form with each successful order to see how the consumer perceived its service. Based on these reviews, the business can understand how its customers feel about the service and gain insights that lead to good data-driven decisions.
People Sheet: Perhaps this is an issue on my end, but the "People" sheet in the Excel file sent to me consists of just 5 rows and 4 columns. It would be nice to have a more in-depth sheet on the people who placed orders so that the business could give them personalized item suggestions and discounts.
Using the smooth and forecast libraries, I calculated a Simple Moving Average to predict the trend of monthly orders across all items for the next two years
library(smooth)
contributions <- FixedMSales$Orders
contributions.ts <- ts(contributions, frequency = 12, start = c(2015, 2))
I used a frequency of 12 since the data is monthly and I want the model to treat 12 observations as one full year
# Use sma() to forecast the number of orders placed over the next 24 months
contributions.fc <- sma(contributions.ts, order=12, h=24,silent=FALSE)
# Print model summary
summary(contributions.fc)
## Time elapsed: 0.09 seconds
## Model estimated: SMA(12)
## Initial values were produced using backcasting.
##
## Loss function type: MSE; Loss function value: 8430.8293
## Error standard deviation: 93.7944
## Sample size: 48
## Number of estimated parameters: 2
## Number of degrees of freedom: 46
## Information criteria:
## AIC AICc BIC BICc
## 574.1213 574.3880 577.8637 578.3799
# Print the forecasts
fc <- forecast(contributions.fc)
print(fc)
## Point forecast Lower bound (2.5%) Upper bound (97.5%)
## Feb 2019 276.0000 87.20169 464.7983
## Mar 2019 286.0833 96.63061 475.5361
## Apr 2019 301.0069 110.78906 491.2248
## May 2019 306.2575 115.14555 497.3695
## Jun 2019 314.8623 122.70634 507.0183
## Jul 2019 320.9342 127.56013 514.3082
## Aug 2019 327.2620 132.46815 522.0559
## Sep 2019 335.7005 139.25341 532.1476
## Oct 2019 345.5089 147.13911 543.8787
## Nov 2019 336.0513 135.44854 536.6541
SMA(12) appears to be a suitable model for this dataset: it shows that through 2019 the number of orders placed will go up and peak around August 2019, then start to fall again from December 2019 until February 2020. This trend is similar to what was observed in the previous years from 2015-2018 (see graph 1). Furthermore, the plot also predicts that orders will increase ever so slightly at the beginning of 2020, then plateau, and then fall ever so slightly again at the end of 2020, just as in previous years.