Load the library

library(tidyverse)

Then read in the data. I converted the original .xlsx file into two separate .csv files (orders.csv and returns.csv) since I was having issues reading the .xlsx file in R. I have also attached the two .csv files to the email in case you run into issues with the R Markdown file.

my_data <- read_csv("C:\\Users\\A.Sharma\\Desktop\\Reports\\orders.csv")

1. What is the average sales value on the West Coast?

my_data %>% filter(Region == "West") %>% summarise(AvgSalesWestCoast = mean(Sales)) 
## # A tibble: 1 x 1
##   AvgSalesWestCoast
##               <dbl>
## 1              226.
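
Note that the "West" region likely includes inland states as well as coastal ones. If "west coast" is meant literally, the average could instead be restricted to the coastal states, as in the sketch below (the choice of which states count as the coast is my assumption):

# Sketch: restrict the average to the literal West Coast states
my_data %>% 
  filter(State %in% c("California", "Oregon", "Washington")) %>% 
  summarise(AvgSalesWestCoast = mean(Sales, na.rm = TRUE))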

2. Number of customers per category per region;

my_data %>% group_by(Category, Region) %>% count(name = "No. of Customers") 
## # A tibble: 12 x 3
## # Groups:   Category, Region [12]
##    Category        Region  `No. of Customers`
##    <chr>           <chr>                <int>
##  1 Furniture       Central                481
##  2 Furniture       East                   601
##  3 Furniture       South                  332
##  4 Furniture       West                   707
##  5 Office Supplies Central               1422
##  6 Office Supplies East                  1712
##  7 Office Supplies South                  995
##  8 Office Supplies West                  1897
##  9 Technology      Central                420
## 10 Technology      East                   535
## 11 Technology      South                  293
## 12 Technology      West                   599
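
Note that count() here tallies order lines rather than distinct customers. If the number of unique customers is wanted instead, a minimal sketch would be the following (assuming orders.csv contains a `Customer ID` column, as in the standard Superstore export):

# Sketch: distinct customers per category and region
my_data %>% 
  group_by(Category, Region) %>% 
  summarise(`No. of Customers` = n_distinct(`Customer ID`))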

3. Which state has the largest increase in profit from Q1-2016 relative to Q1-2015;

NOTE: I based quarters on the order date and NOT the shipping date. I then added columns for year and quarter to my_data

my_data <- my_data %>% 
  mutate(Year = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y"),
         Quarter = quarters(as.Date(`Order Date`, format = "%m/%d/%Y")))
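
An equivalent way to derive these columns (an alternative sketch, not the code used for the results below) is with lubridate, which may need to be loaded explicitly depending on your tidyverse version:

# Alternative sketch: same Year/Quarter columns via lubridate
library(lubridate)
my_data %>% 
  mutate(OrderDate = mdy(`Order Date`),
         Year = as.character(year(OrderDate)),
         Quarter = paste0("Q", quarter(OrderDate)))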

Then, I made a dataframe that only had data from the 1st quarter of 2015 (Q1-2015)

Q1_2015 <- my_data %>% filter(Year == "2015", Quarter == "Q1") 

Then, using the above dataframe, I made another dataframe to calculate the cumulative profit of each state in Q1-2015

StateProfit_Q12015 <- Q1_2015 %>% group_by(State) %>% summarise(CumProfit = sum(Profit, na.rm=TRUE))
StateProfit_Q12015
## # A tibble: 32 x 2
##    State      CumProfit
##    <chr>          <dbl>
##  1 Arizona      -300.  
##  2 Arkansas      437.  
##  3 California   1610.  
##  4 Colorado       -6.55
##  5 Delaware        3.08
##  6 Florida     -1557.  
##  7 Georgia        71.2 
##  8 Illinois      -93.3 
##  9 Indiana        35.9 
## 10 Iowa          110.  
## # ... with 22 more rows

Then I repeated the same process for the 1st quarter of 2016

Q1_2016 <- my_data %>% filter(Year == "2016", Quarter == "Q1") 
StateProfit_Q12016 <- Q1_2016 %>% group_by(State) %>% summarise(CumProfit = sum(Profit, na.rm=TRUE))
StateProfit_Q12016
## # A tibble: 33 x 2
##    State      CumProfit
##    <chr>          <dbl>
##  1 Alabama       250.  
##  2 Arkansas        3.00
##  3 California   2273.  
##  4 Colorado        8.79
##  5 Delaware      777.  
##  6 Florida       254.  
##  7 Georgia      3259.  
##  8 Illinois     -841.  
##  9 Indiana        76.7 
## 10 Kansas          7.82
## # ... with 23 more rows

We can now find the relative profit increase for each state from Q1-2015 to Q1-2016 by merging the two dataframes shown above

Q1_1516 <- merge(StateProfit_Q12016, StateProfit_Q12015, by = "State", all = TRUE, suffixes = c("2016", "2015"))

and then computing the change in profit for each state (treating a missing quarter as zero profit) and arranging the data in descending order of that change to find the state with the largest increase

Q1_1516State <- Q1_1516 %>% 
  mutate(RelativeProf = ifelse(is.na(CumProfit2016), 0, CumProfit2016) - ifelse(is.na(CumProfit2015), 0, CumProfit2015)) %>% 
  arrange(desc(RelativeProf))
Q1_1516State
##             State CumProfit2016 CumProfit2015 RelativeProf
## 1         Georgia     3258.9156       71.2178    3187.6978
## 2      Washington     2944.0245      360.7111    2583.3134
## 3         Florida      254.0293    -1556.5105    1810.5398
## 4        New York      815.8103     -530.6330    1346.4433
## 5        Maryland     1296.1396            NA    1296.1396
## 6        Delaware      777.4231        3.0814     774.3417
## 7       Wisconsin      908.9740      206.3160     702.6580
## 8      California     2272.7948     1610.3104     662.4844
## 9   New Hampshire      339.5570            NA     339.5570
## 10        Arizona            NA     -300.1272     300.1272
## 11        Alabama      249.7311            NA     249.7311
## 12       Nebraska       62.0658            NA      62.0658
## 13          Texas     -529.8081     -591.3309      61.5228
## 14      Tennessee       25.5744      -33.6438      59.2182
## 15        Indiana       76.6655       35.8524      40.8131
## 16     New Jersey       74.5654       49.6048      24.9606
## 17        Montana       20.8106            NA      20.8106
## 18       Colorado        8.7880       -6.5538      15.3418
## 19  Massachusetts       13.7400            NA      13.7400
## 20         Kansas        7.8221            NA       7.8221
## 21     New Mexico       23.0864       22.6782       0.4082
## 22         Oregon        2.5641       25.2757     -22.7116
## 23         Nevada            NA       24.7712     -24.7712
## 24   Rhode Island      -27.1642            NA     -27.1642
## 25      Minnesota       74.3798      114.6568     -40.2770
## 26           Utah        5.7762       46.9632     -41.1870
## 27    Mississippi            NA       42.8123     -42.8123
## 28   South Dakota            NA       44.7684     -44.7684
## 29 South Carolina       14.7604      105.0954     -90.3350
## 30   Pennsylvania      -77.1917       19.4213     -96.6130
## 31       Missouri            NA      106.7911    -106.7911
## 32           Iowa            NA      109.5012    -109.5012
## 33       Michigan       20.6336      205.9472    -185.3136
## 34      Louisiana            NA      326.5672    -326.5672
## 35       Arkansas        2.9990      437.2475    -434.2485
## 36       Illinois     -841.1898      -93.2520    -747.9378
## 37           Ohio    -1565.6771     -327.2528   -1238.4243
## 38       Kentucky      131.0797     1567.4652   -1436.3855
## 39       Virginia      239.6253     1729.7589   -1490.1336
## 40 North Carolina    -1616.3631      -16.2817   -1600.0814
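
Based on this table, Georgia had the largest increase in profit from Q1-2015 to Q1-2016 (roughly $3,188). The same comparison could also be computed in a single pipeline; a sketch (using the Year and Quarter columns created above, and again treating a missing quarter as zero profit):

# Sketch: the Q1-2015 vs Q1-2016 comparison in one pipeline
my_data %>% 
  filter(Quarter == "Q1", Year %in% c("2015", "2016")) %>% 
  group_by(State, Year) %>% 
  summarise(CumProfit = sum(Profit, na.rm = TRUE)) %>% 
  pivot_wider(names_from = Year, values_from = CumProfit,
              names_prefix = "CumProfit", values_fill = 0) %>% 
  mutate(RelativeProf = CumProfit2016 - CumProfit2015) %>% 
  arrange(desc(RelativeProf))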

4. The number of orders returned by region;

I begin by reading in the returns.csv file of all returned orders. Then, I keep the unique Order IDs of the returned orders

my_data_returns <- read_csv("C:\\Users\\A.Sharma\\Desktop\\Reports\\returns.csv")
my_data_orderid <- unique(my_data_returns[c("Order ID")])

Then, for the sake of convenience and brevity, I make a new dataframe containing only the Region and Order ID columns from the orders data

my_data_region <- my_data %>% select(Region, `Order ID`) 

Then, we merge the above two dataframes based on order IDs and group by region to see a count of the number of orders that were returned per region

Returned_Orders <- merge(x = my_data_region, y = my_data_orderid, by = "Order ID", all.y = TRUE) %>% group_by(Region) %>% count(name = "No.Of Returns")
Returned_Orders
## # A tibble: 5 x 2
## # Groups:   Region [5]
##   Region  `No.Of Returns`
##   <chr>             <int>
## 1 Central              92
## 2 East                149
## 3 South                69
## 4 West                490
## 5 <NA>                  1
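
Because orders.csv has one row per order line, the counts above reflect returned order lines rather than distinct returned orders. A sketch that counts distinct returned orders per region instead (assuming each Order ID belongs to a single region) would be:

# Sketch: count distinct returned orders (not order lines) per region
my_data %>% 
  select(Region, `Order ID`) %>% 
  distinct() %>% 
  semi_join(my_data_returns, by = "Order ID") %>% 
  count(Region, name = "No.Of Returns")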

5. The average number of days between order date and ship date in each state;

To see the average number of days between the order and shipping date per state, I first make a new column called Delay by subtracting the order date column from the ship date column, which gives the shipping delay in days for each order

Delay <- my_data %>% mutate(Delay = as.Date(`Ship Date`, format = "%m/%d/%Y") - as.Date(`Order Date`, format = "%m/%d/%Y"))

Grouping the above dataframe by state and taking the mean of Delay shows the average number of days between the order and shipping date for each state, as follows

Delay %>% group_by(State) %>% summarise(mean(Delay))
## # A tibble: 49 x 2
##    State                `mean(Delay)`
##    <chr>                <drtn>       
##  1 Alabama              4.114754 days
##  2 Arizona              4.071429 days
##  3 Arkansas             4.133333 days
##  4 California           3.867066 days
##  5 Colorado             3.681319 days
##  6 Connecticut          3.597561 days
##  7 Delaware             4.270833 days
##  8 District of Columbia 5.700000 days
##  9 Florida              3.947781 days
## 10 Georgia              3.836957 days
## # ... with 39 more rows
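
If a plain numeric summary is preferred over the difftime printout, the same result can be converted and rounded; a small sketch on top of the Delay dataframe above:

# Sketch: average shipping delay per state as a rounded number of days
Delay %>% 
  group_by(State) %>% 
  summarise(AvgDelayDays = round(as.numeric(mean(Delay)), 2))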

6. A Forecast of orders

NOTE: To make a forecast of orders, I did a time-series analysis of the number of orders placed per month between 2015 and 2018. In order to do so, I first extract the year and the year/month from the order date column and add them to a new dataframe.

allsales <- my_data %>% 
  mutate(MY = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y/%m"),
         Year = format(as.Date(`Order Date`, format = "%m/%d/%Y"), "%Y"))

Then, I take a count of the orders placed every month between 2015 and 2018

allmsales <- allsales %>% group_by(Year, MY) %>% count(name = "Orders")

Then, I convert the month column of the monthly sales dataframe to a proper date so that I can perform a time-series analysis

FixedMSales <- allmsales %>% rename(time = MY) %>% mutate(time = as.Date(parse_date(time, "%Y/%m"))) %>% na.omit()
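
If parse_date() with a "%Y/%m" format causes trouble on another setup, an equivalent alternative (a sketch, not the code used for the results here) is to paste a day onto the string and use as.Date directly:

# Alternative sketch: build a first-of-month Date from the "YYYY/MM" string
allmsales %>% 
  rename(time = MY) %>% 
  mutate(time = as.Date(paste0(time, "/01"), format = "%Y/%m/%d")) %>% 
  na.omit()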

I will now use the above dataframe to plot and forecast the monthly orders; the plot of orders from 2015-2018 looks as follows

ggplot(FixedMSales, aes(x = time, y = Orders)) +
  geom_line(col = "hotpink") + 
  scale_x_date(date_labels = "%y %b", date_breaks = "2 month") +
  theme_bw() + theme(legend.title = element_blank(),
                     axis.text.x  = element_text(angle=45, vjust=0.5))

This plot shows that orders peak between August and October of each year (which makes sense, since this is the end of the summer break in the northern hemisphere and the start of the new academic year and of busy working months in America; both could explain the spike). Orders fall to their lowest between January and February of each year, which can be explained by the end of the winter break and the halfway point of the academic year.

To see whether the overall trend is rising or falling, I made a frequency plot and, as I suspected, it shows an overall rising trend throughout the years

ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"))) +
  geom_freqpoly() + xlab("Time") + ylab("Number of Orders") 

Now, to start with the forecasting, we first need to load the forecast library

library(forecast)

The forecast package in R contains a very useful function called auto.arima which helps us select the best ARIMA model.

An autocorrelation plot of the monthly order counts shows that each observation is fairly correlated with the previous 2-3 observations.
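
The chunk that produced that plot is not shown above; a minimal sketch that would reproduce it with the forecast package (using the FixedMSales dataframe built earlier) is:

# Sketch: autocorrelation plot of the monthly order counts
ggAcf(as.ts(FixedMSales$Orders)) + ggtitle("ACF of monthly orders")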

I will use this model to predict one month at a time. In order to evaluate the performance of the model, we split the dataset into a training set (for example, the first 30 observations), fit the model, and predict the 31st observation. Then we run the model on the first 31 observations, predict the 32nd, and so on. If this works, then we can be fairly confident that when we predict the 49th observation, we will not make a much larger error.

So first we split the dataset, allowing for a varying index:

train_index <- 30
n_total <- nrow(FixedMSales)
FixedMSales_train1 <- FixedMSales[1:(train_index),]
FixedMSales_test <- FixedMSales[(train_index+1):n_total,]
predicted <- numeric(n_total-train_index)

Then we run a loop that refits the model and forecasts one month ahead in each iteration

for (i in 1:(n_total-train_index)) {
  # Refit the model on all observations up to the current month
  FixedMSales_train <- FixedMSales[1:(train_index-1+i),]
  arima_model <- auto.arima(as.ts(FixedMSales_train$Orders))
  # Forecast one month ahead and store the point forecast
  pred <- forecast(arima_model, 1)
  predicted[i] <- pred$mean
}
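
As a quick check (not part of the original output), the accuracy of these one-step-ahead forecasts could be summarised like so:

# Sketch: one-step-ahead forecast error on the held-out months
rmse <- sqrt(mean((FixedMSales_test$Orders - predicted)^2))
mae  <- mean(abs(FixedMSales_test$Orders - predicted))
c(RMSE = rmse, MAE = mae)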

Now, I will build a dataframe that contains the observed and predicted values for each month of each year

df_pred <- tibble(obs = c(FixedMSales_train1$Orders, FixedMSales_test$Orders), 
                  predicted = c(FixedMSales_train1$Orders, predicted), 
                  time = FixedMSales$time)

And finally, I use this dataframe to plot the observed and predicted number of orders placed every month between 2015 and 2018


ggplot(gather(df_pred, obs_pred, value, -time) %>% 
         mutate(obs_pred = factor(obs_pred, levels = c("predicted", "obs"))), 
       aes(x = time, y = value, col = obs_pred, linetype = obs_pred)) +
  geom_line() +
  xlab("") + ylab("") +
  scale_color_manual(values=c("black", "hotpink")) +
  scale_linetype_manual(values=c(2, 1)) +
  scale_x_date(date_labels = "%Y %b", date_breaks = "2 month") +
  theme_bw() + theme(legend.title = element_blank(),
                     axis.text.x  = element_text(angle=45, vjust=0.5))

7. Points of Attention to Guarantee Data Quality

I have 7 points of attention to ensure data quality

  1. Accuracy: Ensure that whatever data I have is correct and free of any errors like typos.
  2. Completeness: Make sure that the dataset is complete and does not have any missing values. If there are missing values, it is very important to either fill them in with correct data or deal with them in the right manner.
  3. Relevance: Ensure that the data I have will help me solve the problems at hand.
  4. Reliability: In my opinion, the data should be clear at first sight and should not include any misleading or vague information. If some information is vague, it should be fixed to make it more usable.
  5. Time-appropriate: While old data is helpful at times, it is also important (in my opinion) that the data is not too outdated and is updated frequently, such that reliable predictions can be made.
  6. Consistency: The data should have the same format throughout so that cross-referencing is possible. Most data, however, is quite inconsistent, so it is important to take care of the data format when doing transformative work.
  7. Source: Last but definitely not least, the data should come from a direct and reliable source. I believe meddling with the data by third parties only further degrades its value.

8. Which data relations would you like to discover more and why?

  1. Best-Selling Sub-Category: I would like to see which sub-category of items sold the most over time. Knowing which items sell the most is useful for both the producer and the consumer, as it helps in setting the right prices, which in turn helps maximize producer profit and consumer surplus! In fact, I did some analysis and found that binders were the best-selling sub-category for this business over the entire period.
Sales <- my_data %>% group_by(`Sub-Category`) %>% count() %>%  arrange(desc(n)) 
Sales
## # A tibble: 17 x 2
## # Groups:   Sub-Category [17]
##    `Sub-Category`     n
##    <chr>          <int>
##  1 Binders         1523
##  2 Paper           1370
##  3 Furnishings      957
##  4 Phones           889
##  5 Storage          846
##  6 Art              796
##  7 Accessories      775
##  8 Chairs           617
##  9 Appliances       466
## 10 Labels           364
## 11 Tables           319
## 12 Envelopes        254
## 13 Bookcases        228
## 14 Fasteners        217
## 15 Supplies         190
## 16 Machines         115
## 17 Copiers           68

As you can see, this business received 1,523 orders for binders over 4 years, which happens to be the highest number of orders placed for any sub-category.
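
Note that this ranking is by the number of order lines; ranking by total sales value instead is a small variation (a sketch using the same Sales column as in question 1):

# Sketch: rank sub-categories by total sales value instead of order count
my_data %>% 
  group_by(`Sub-Category`) %>% 
  summarise(TotalSales = sum(Sales, na.rm = TRUE)) %>% 
  arrange(desc(TotalSales))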

  2. Sub-Category Sales Trend: In addition to finding the best-selling item over time, it also helps to see which items are gaining and/or losing popularity over time. This trend analysis could help businesses manage their inventory in the most optimal way possible. Once again, I did a quick analysis and made a plot of the number of orders placed for each sub-category over time.
ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"), color = `Sub-Category`)) +
  geom_freqpoly() + xlab("Time") + ylab("Number of Orders") 

This graph is quite descriptive in and of itself; however, for colorblind people like me, it is an absolute nightmare! Hence, I made separate graphs for each sub-category for a neater overview

ggplot(data = my_data, mapping = aes(x = as.Date(`Order Date`, format = "%m/%d/%Y"), color = `Sub-Category`)) +
  geom_freqpoly() + facet_wrap(~ `Sub-Category`, nrow = 3) + xlab("Time") + ylab("Number of Orders") 

From these graphs, we can conclude that even though binders are the best-selling item overall, paper sales are gaining popularity. This is potentially useful information for the online vendor, as they can increase stock of and offer bundle prices on popular items to increase profits.

  3. Discount Pricing: I would’ve liked to explore which sub-category, as well as which segment, got the most discount, and how the discounted price compares to the original price based on the number of items ordered. Ideally, I would make a graph showing the discounted price vs. the original price for each item. I would also make a graph to see how much discount each customer got. Unfortunately, I did not have enough time to explore this, but perhaps in the future!

  4. Sales Prediction: I would’ve liked to forecast the orders per sub-category and per product. Predicting the orders for each product in the near future helps keep the business prepared for demand and/or supply shocks. Unfortunately, once again, I did not have time to go over this, but I would love to explore this avenue in the future!

9. Which data would you like to request more and why?

I would like to have more data from the following fields

  1. Return Date: The returns sheet only has the Order ID of which item was returned but not when it was returned. I think it would be useful to have this information to then see the average number of days between shipping and return.

  2. Feedback on Orders: The e-commerce business should send out a feedback form with each successful order to see how the consumer perceived their service. Based on these product reviews, the business can understand how its customers feel about its service and gain insights that lead to good data-driven decisions.

  3. People Sheet: Perhaps this is an issue on my end, but the “people” sheet in the Excel file sent to me consists of just 5 rows and 4 columns. It would be nice to have a more in-depth sheet of the people who placed orders, so that the business could give them personalized item suggestions and discounts.

10. Some Exciting Extra Information

Using the smooth and forecast libraries, I calculated a Simple Moving Average to predict the trend of monthly orders across all items for the next two years

library(smooth)
contributions <- FixedMSales$Orders
contributions.ts <- ts(contributions, frequency = 12, start = c(2015, 2))

I used a frequency of 12 since the data is monthly, so that one full cycle of the series corresponds to one year

# Use sma() to forecast number of orders placed 2020 and 2021
contributions.fc <- sma(contributions.ts, order=12, h=24,silent=FALSE)

# Print model summary
summary(contributions.fc)
## Time elapsed: 0.09 seconds
## Model estimated: SMA(12)
## Initial values were produced using backcasting.
## 
## Loss function type: MSE; Loss function value: 8430.8293
## Error standard deviation: 93.7944
## Sample size: 48
## Number of estimated parameters: 2
## Number of degrees of freedom: 46
## Information criteria:
##      AIC     AICc      BIC     BICc 
## 574.1213 574.3880 577.8637 578.3799
# Print the forecasts
fc <- forecast(contributions.fc)
print(fc)
##          Point forecast Lower bound (2.5%) Upper bound (97.5%)
## Feb 2019       276.0000           87.20169            464.7983
## Mar 2019       286.0833           96.63061            475.5361
## Apr 2019       301.0069          110.78906            491.2248
## May 2019       306.2575          115.14555            497.3695
## Jun 2019       314.8623          122.70634            507.0183
## Jul 2019       320.9342          127.56013            514.3082
## Aug 2019       327.2620          132.46815            522.0559
## Sep 2019       335.7005          139.25341            532.1476
## Oct 2019       345.5089          147.13911            543.8787
## Nov 2019       336.0513          135.44854            536.6541

SMA(12) seems to be the right model for this dataset: the forecast shows that in 2019 the number of orders placed will go up and peak around August, then start to fall again from December 2019 until February 2020. This trend is similar to what was observed in the previous years from 2015-2018 (see the first plot). Furthermore, the forecast also predicts that orders will increase ever so slightly at the beginning of 2020, then plateau, and then fall ever so slightly again at the end of 2020, just as in previous years.