Start by loading the neccesary libraries

library(psych)
## Warning: package 'psych' was built under R version 4.2.3
library(xlsx)
## Warning: package 'xlsx' was built under R version 4.2.3
library(validate)
## Warning: package 'validate' was built under R version 4.2.3

Prepare input functions (mean for quantitative variables and mode for qualitative variables)

impute_by_mean <- function(impute_data) {   
for(i in 1:ncol(impute_data)){
  if(is.numeric(impute_data[[i]]) || is.double(impute_data[[i]]) ){
    na <- is.na(impute_data[[i]])
    impute_data[na, i] <- mean(impute_data[[i]], na.rm = TRUE) 
  }
  
}
  return (impute_data)
}


impute_by_mode <- function(impute_data) {   
for(i in 1:ncol(impute_data)){
  if(is.factor(impute_data[[i]]) ){
    na <- is.na(impute_data[[i]])
    impute_data[na, i] <-  i_mode(impute_data[[i]]) #check median as well
  }
}
  return (impute_data)
}

i_mode <- function(x) {                                     
  unique_x <- unique(x)
  mode <- unique_x[which.max(tabulate(match(x, unique_x)))]
  mode
}

We now start exploring the data

library(xlsx)
mydf <- read.xlsx("G:/My Drive/Job Hunting/IMS Interview/IMS_data.xlsx", 1) # the second input in the function correspond to the sheet # is being loaded

Describe our data

describe(mydf)
##              vars   n   mean    sd median trimmed   mad    min    max  range
## Date*           1 104  52.50 30.17  52.50   52.50 38.55   1.00 104.00 103.00
## Temperature     2 104  26.47 15.14  25.38   24.91  7.33  11.90 130.01 118.11
## Distribution    3  98   0.83  0.04   0.82    0.83  0.04   0.77   0.90   0.13
## Price           4 100   1.43  1.71   1.21    1.18  0.04   0.80  15.40  14.60
## TV.ads          5 104   2.30  5.35   0.00    1.24  0.00   0.00  40.00  40.00
## Sales.Volume    6  99 140.63  8.25 141.52  141.13  7.53 119.95 156.36  36.41
##               skew kurtosis   se
## Date*         0.00    -1.23 2.96
## Temperature   5.11    30.92 1.48
## Distribution  0.27    -1.10 0.00
## Price         6.72    47.34 0.17
## TV.ads        4.78    27.75 0.52
## Sales.Volume -0.52    -0.18 0.83

Except for TV.Ads, there are no significant differences between the mean of each variable and the trimmed mean (-2.5% top and bottom), suggesting that there are no extreme values in each variable

Our five variables (excluding Date) have low dispersion (mad is relatively common for all of them). Recall that MAD is the average distance of the data points to the median.

Trimmed and mad, together with mean and median, confirm that our dataset seems not to have outliers and its dispersion is low. Finally, the median and mean are similar, suggesting that our data is normally distributed.

We now deal with the issue of incomplete data (Na) and zeros. But before reaching any conclusion, we check our data.

(colMeans(is.na(mydf)))*100 
##         Date  Temperature Distribution        Price       TV.ads Sales.Volume 
##     0.000000     0.000000     5.769231     3.846154     0.000000     4.807692

What do we know about this data? It is a time series of two years: 2009 and 2010. Every entrance is a week of each year, 52 weeks for each year. For this dataset in particular, NA is incomplete or missing data (not zeros as in Tv. Ads), and missing data happens in a low proportion in 3 variables: Distribution (less than 6%), Price (less than 4%), and Sales.volume (less than 5%).

Because our idea is to create a predictive model (rather than just informing the peculiarities of the dataset) we decide to impute the NA for the mean of each column.

df <- impute_by_mean(mydf)
# we check the results after the impute function 
(colMeans(is.na(df)))*100 
##         Date  Temperature Distribution        Price       TV.ads Sales.Volume 
##            0            0            0            0            0            0

Our new dataframe (df) has no missing values. The zeros in the column “sales” represent the weeks were no Tv ads have been shown and therefore are not missing data.

Lets create a time series for our data

#We start by dropping our Date column 
Time_series <- df[, -1]
ts_data <- ts(Time_series, start = c(2009, 1), frequency = 52)
plot(ts_data)

Our graph is not particularly informative. We could try dropping columns and combine them, but this will take a bit of time (20 combinations):

x <-c("Price", "Dist", "Temp", "Sales", "Tv.Ads")
pair_combinations <- combn(x, 2)
pair_combinations
##      [,1]    [,2]    [,3]    [,4]     [,5]   [,6]    [,7]     [,8]    [,9]    
## [1,] "Price" "Price" "Price" "Price"  "Dist" "Dist"  "Dist"   "Temp"  "Temp"  
## [2,] "Dist"  "Temp"  "Sales" "Tv.Ads" "Temp" "Sales" "Tv.Ads" "Sales" "Tv.Ads"
##      [,10]   
## [1,] "Sales" 
## [2,] "Tv.Ads"

Let’s instead choose our pairs using correlation:

cor(Time_series)
##              Temperature Distribution       Price      TV.ads Sales.Volume
## Temperature   1.00000000   0.05725572 -0.13835789 -0.02737689    0.3984377
## Distribution  0.05725572   1.00000000  0.24630061  0.24235407    0.3687539
## Price        -0.13835789   0.24630061  1.00000000  0.03461032   -0.1306162
## TV.ads       -0.02737689   0.24235407  0.03461032  1.00000000    0.3247659
## Sales.Volume  0.39843770   0.36875391 -0.13061616  0.32476587    1.0000000

None of our variables seems to be highly correlated. We can however, investigate the pairs that seems more correlated: “Sales - Temperature” “Sales - Distribution” “Sales - TV-Ads”

Lets go back to our Time Series

# Sales vs Distribution
par(mfrow = c(2, 2))
ts_data <- ts(Time_series$Temperature, start = c(2009, 1), frequency = 52)
ts_data_1 <- ts(Time_series$Distribution, start = c(2009, 1), frequency = 52)
ts_data_2 <- ts(Time_series$TV.ads, start = c(2009, 1), frequency = 52)
ts_data_3 <- ts(Time_series$Sales.Volume, start = c(2009, 1), frequency = 52)

plot(ts_data, ylab="Temperature")
plot(ts_data_1, ylab="Distribution")
plot(ts_data_2, ylab="TV.ads")
plot(ts_data_3, ylab="Sales.Volume")

The sales volume is not particularly strongly associated with any of our variables of interest. Mathematically none of the variables are strongly correlated, and the plots tend to confirm this result. Distribution has increased in these two years, but the level of sales remains either erratic or constant. Temperature (seasonal effect?) does not impact either the level of distribution or the level of sales. The level of sales is not related to price: reflecting the fact that the company follows the market price and cannot (or won’t) change it. The spike in the level of ads did not change the sales status either. The firm may have reached diminishing returns on TV ads. More expenditure on TV ads has no impact whatsoever on the level of sales.

Creating a model

# set the seed for reproducibility
set.seed(123)

# split the data into 70% training and 30% testing
train_index <- sample(nrow(df), round(0.7 * nrow(df)))
train_data <- Time_series[train_index, ]
test_data <- Time_series[-train_index, ]
#Lets start creating a linear regression model (non scaled)
set.seed(345)
lm_model <-lm(data=train_data, train_data$Sales.Volume ~ train_data$Temperature + train_data$Distribution + train_data$Price + I(train_data$TV.ads^2))
summary(lm_model)
## 
## Call:
## lm(formula = train_data$Sales.Volume ~ train_data$Temperature + 
##     train_data$Distribution + train_data$Price + I(train_data$TV.ads^2), 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3286  -3.0742   0.1812   4.1699  10.7722 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             76.114457  17.543895   4.339 4.87e-05 ***
## train_data$Temperature   0.260545   0.060237   4.325 5.11e-05 ***
## train_data$Distribution 71.233103  21.337465   3.338  0.00137 ** 
## train_data$Price        -0.565401   0.403169  -1.402  0.16535    
## I(train_data$TV.ads^2)   0.005134   0.004080   1.258  0.21263    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.32 on 68 degrees of freedom
## Multiple R-squared:  0.3499, Adjusted R-squared:  0.3117 
## F-statistic:  9.15 on 4 and 68 DF,  p-value: 5.645e-06

Just temperature and Distribution are statistically significant (to a 95% confidence interval or more).

Our model is not great: it explains only 35% of the variation in sales. By reading our coefficients, one unit change in Temperature increases sales (measured in 1000s) by 260 and one unit change in Distribution increases sales by 71.

Lets check the normality plots before evaluating other models:

plot(lm_model)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Our first model tends to normality (Q-Q Plot). Our model seems to be linear if we ignore the two outliers (i.e., observation 76)
Similarly, our model is homoskedastic without considering the two outliers (most of the observations are equally distributed around the line).
Finally, Residual vs Leverage shows that we have two outliers (97 and 76) that are leverage points too. We will have to eliminate these two observations (76 and 97) before carrying out further analysis.

row_index <- which(train_data$Temperature == 117.86)
row_index_1 <- which(train_data$TV.ads == 40)

train_data <-train_data[-row_index,]
train_data <- train_data[-row_index_1,]

Check the new dataframe

summary(train_data)
##   Temperature     Distribution        Price            TV.ads     
##  Min.   :12.46   Min.   :0.7700   Min.   : 0.800   Min.   :0.000  
##  1st Qu.:21.05   1st Qu.:0.8000   1st Qu.: 1.115   1st Qu.:0.000  
##  Median :26.37   Median :0.8200   Median : 1.210   Median :0.000  
##  Mean   :25.21   Mean   :0.8264   Mean   : 1.479   Mean   :1.677  
##  3rd Qu.:30.43   3rd Qu.:0.8550   3rd Qu.: 1.230   3rd Qu.:2.750  
##  Max.   :37.90   Max.   :0.9000   Max.   :15.400   Max.   :8.400  
##   Sales.Volume  
##  Min.   :120.7  
##  1st Qu.:137.3  
##  Median :141.5  
##  Mean   :141.0  
##  3rd Qu.:145.4  
##  Max.   :156.4

Lets run our model again:

set.seed(235)
lm_model <-lm(data=train_data, train_data$Sales.Volume ~ train_data$Temperature + train_data$Distribution + train_data$Price + I(train_data$TV.ads^2))
summary(lm_model)
## 
## Call:
## lm(formula = train_data$Sales.Volume ~ train_data$Temperature + 
##     train_data$Distribution + train_data$Price + I(train_data$TV.ads^2), 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2253 -3.1470  0.0357  2.7936  9.7810 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             47.00597   11.13330   4.222 7.57e-05 ***
## train_data$Temperature   0.93751    0.07609  12.321  < 2e-16 ***
## train_data$Distribution 84.02633   13.13833   6.396 1.90e-08 ***
## train_data$Price        -0.05616    0.25565  -0.220    0.827    
## I(train_data$TV.ads^2)   0.10659    0.02408   4.427 3.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.862 on 66 degrees of freedom
## Multiple R-squared:  0.7542, Adjusted R-squared:  0.7393 
## F-statistic: 50.63 on 4 and 66 DF,  p-value: < 2.2e-16

Our model has dramatically improved: another independent variable is now statistically significant and our model explains much more of the variation in the dependant variable: 75%. Before interpreting the coefficients, we run diagnostic plots:

plot(lm_model)

row_index_2<- which(train_data$Price == 15.4)
train_data <- train_data[-row_index_2,]
summary(train_data)
##   Temperature     Distribution        Price           TV.ads     
##  Min.   :13.42   Min.   :0.7700   Min.   :0.800   Min.   :0.000  
##  1st Qu.:21.30   1st Qu.:0.8000   1st Qu.:1.113   1st Qu.:0.000  
##  Median :26.49   Median :0.8200   Median :1.210   Median :0.000  
##  Mean   :25.39   Mean   :0.8255   Mean   :1.280   Mean   :1.666  
##  3rd Qu.:30.57   3rd Qu.:0.8500   3rd Qu.:1.230   3rd Qu.:2.750  
##  Max.   :37.90   Max.   :0.9000   Max.   :9.700   Max.   :8.400  
##   Sales.Volume  
##  Min.   :120.7  
##  1st Qu.:137.6  
##  Median :141.8  
##  Mean   :141.1  
##  3rd Qu.:145.4  
##  Max.   :156.4

Run our model and plot diagnostics for a second time.

lm_model <-lm(data=train_data, train_data$Sales.Volume ~ train_data$Temperature + train_data$Distribution + train_data$Price + I(train_data$TV.ads^2))
summary(lm_model)
## 
## Call:
## lm(formula = train_data$Sales.Volume ~ train_data$Temperature + 
##     train_data$Distribution + train_data$Price + I(train_data$TV.ads^2), 
##     data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.277 -3.085  0.046  2.868  9.600 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             48.15300   11.08186   4.345 4.99e-05 ***
## train_data$Temperature   0.92991    0.07573  12.280  < 2e-16 ***
## train_data$Distribution 83.61023   13.04550   6.409 1.89e-08 ***
## train_data$Price        -0.63669    0.48462  -1.314    0.194    
## I(train_data$TV.ads^2)   0.11613    0.02485   4.674 1.54e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.833 on 65 degrees of freedom
## Multiple R-squared:  0.7599, Adjusted R-squared:  0.7451 
## F-statistic: 51.43 on 4 and 65 DF,  p-value: < 2.2e-16
plot(lm_model)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

While we still have other outliers, eliminating any additional outliers will have minimal effect on both the shape and performance of our model (R and R2 have just slightly changed).

Instead, we scale our model to compare the impact on the dependent variable amongst the predictors.

scaled_lm_model <-lm(data=train_data, train_data$Sales.Volume ~ scale(train_data$Temperature) + scale(train_data$Distribution) + scale(train_data$Price) + scale(I(train_data$TV.ads^2)))

summary(scaled_lm_model)
## 
## Call:
## lm(formula = train_data$Sales.Volume ~ scale(train_data$Temperature) + 
##     scale(train_data$Distribution) + scale(train_data$Price) + 
##     scale(I(train_data$TV.ads^2)), data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.277 -3.085  0.046  2.868  9.600 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    141.1151     0.4582 307.985  < 2e-16 ***
## scale(train_data$Temperature)    5.8731     0.4783  12.280  < 2e-16 ***
## scale(train_data$Distribution)   3.0032     0.4686   6.409 1.89e-08 ***
## scale(train_data$Price)         -0.6558     0.4992  -1.314    0.194    
## scale(I(train_data$TV.ads^2))    2.2733     0.4864   4.674 1.54e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.833 on 65 degrees of freedom
## Multiple R-squared:  0.7599, Adjusted R-squared:  0.7451 
## F-statistic: 51.43 on 4 and 65 DF,  p-value: < 2.2e-16

Temperature, distribution and TV ads are statistically significant. Because we have standardised our independent variables, we can now compare and say which variable has a higher impact on sales: in this case, Temperature, followed by Distribution and Tv ads.

The coefficients are read as follows,

sd(train_data$Temperature)
 #One point increase in standard deviation increase in temperature (6.3 degrees) leads to increase of 1000 x 5.87 

sd(train_data$Distribution)
# One point increase in standard deviation in Distribution (0.03%), leads to an increase of 1000 x 3.0032  

sd(train_data$TV.ads)
#One standard deviation increase in TV ads (2.686) increases sales by 2.27

Lets run our model in our validation set.

scaled_lm_model_val <-lm(data=test_data, test_data$Sales.Volume ~ scale(test_data$Temperature) + scale(test_data$Distribution) + scale(test_data$Price) + scale(I(test_data$TV.ads^2)))
summary(scaled_lm_model_val)
## 
## Call:
## lm(formula = test_data$Sales.Volume ~ scale(test_data$Temperature) + 
##     scale(test_data$Distribution) + scale(test_data$Price) + 
##     scale(I(test_data$TV.ads^2)), data = test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.512  -4.255  -1.253   4.506  14.548 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    139.049      1.321 105.276  < 2e-16 ***
## scale(test_data$Temperature)     2.040      1.401   1.456  0.15724    
## scale(test_data$Distribution)    4.625      1.603   2.885  0.00777 ** 
## scale(test_data$Price)          -3.147      1.511  -2.083  0.04718 *  
## scale(I(test_data$TV.ads^2))     0.865      1.446   0.598  0.55487    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.354 on 26 degrees of freedom
## Multiple R-squared:  0.4095, Adjusted R-squared:  0.3186 
## F-statistic: 4.507 on 4 and 26 DF,  p-value: 0.006717
plot(scaled_lm_model_val)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Our model performs poorly in our validation test: only 40% of the variation is explained. However, this is due to the presence of outliers (for example: a recorded temperature of 130 Celsius is a wrong) We quickly fix these mistakes:

row_index_3<- which(test_data$TV.ads == 30)
row_index_4<- which(test_data$Price == 5.6)
row_index_5<- which(test_data$Temperature == 130.01)

test_data <- test_data[-row_index_3, ]
test_data <- test_data[-row_index_4, ]
test_data <- test_data[-row_index_5, ]

row_index_6<- which(test_data$Price == 5.6)
test_data <- test_data[-row_index_6,]
summary(test_data)
##   Temperature     Distribution        Price           TV.ads     
##  Min.   :11.90   Min.   :0.7700   Min.   :0.890   Min.   :0.000  
##  1st Qu.:18.66   1st Qu.:0.7975   1st Qu.:1.180   1st Qu.:0.000  
##  Median :22.81   Median :0.8200   Median :1.225   Median :0.000  
##  Mean   :23.28   Mean   :0.8192   Mean   :1.186   Mean   :1.686  
##  3rd Qu.:29.33   3rd Qu.:0.8425   3rd Qu.:1.240   3rd Qu.:2.875  
##  Max.   :32.67   Max.   :0.8900   Max.   :1.428   Max.   :8.500  
##   Sales.Volume  
##  Min.   :120.0  
##  1st Qu.:134.6  
##  Median :138.9  
##  Mean   :138.7  
##  3rd Qu.:144.6  
##  Max.   :153.8

We run again our model:

scaled_lm_model_val <-lm(data=test_data, test_data$Sales.Volume ~ scale(test_data$Temperature) + scale(test_data$Distribution) + scale(test_data$Price) + scale(I(test_data$TV.ads^2)))
summary(scaled_lm_model_val)
## 
## Call:
## lm(formula = test_data$Sales.Volume ~ scale(test_data$Temperature) + 
##     scale(test_data$Distribution) + scale(test_data$Price) + 
##     scale(I(test_data$TV.ads^2)), data = test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3251 -2.4532 -0.0641  2.4927  5.5717 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   138.7011     0.7301 189.983  < 2e-16 ***
## scale(test_data$Temperature)    6.0771     0.7528   8.073 3.67e-08 ***
## scale(test_data$Distribution)   3.4337     0.7629   4.501 0.000161 ***
## scale(test_data$Price)         -1.4408     0.7491  -1.923 0.066884 .  
## scale(I(test_data$TV.ads^2))    2.1667     0.7588   2.855 0.008951 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.863 on 23 degrees of freedom
## Multiple R-squared:  0.8414, Adjusted R-squared:  0.8138 
## F-statistic: 30.51 on 4 and 23 DF,  p-value: 6.783e-09
plot(scaled_lm_model_val)

Our model is now much better and closer in performance to the training data.
Lets plot the three most significant variables:

#Distribution Plot 
plot(test_data$Distribution, test_data$Sales.Volume, ylab="Sales", xlab="Distribution", main="Regression Line Sales vs Distribution")
abline(lm(Sales.Volume ~ Distribution, data=test_data), col="red")

#Temperature plot 
plot(test_data$Temperature, test_data$Sales.Volume, ylab="Sales", xlab="Temperature", main="Regression Line Sales vs Temperature")
abline(lm(Sales.Volume ~ Temperature, data=test_data), col="red")

#Tv.Ads plot 
plot(test_data$TV.ads, test_data$Sales.Volume, ylab="Sales", xlab="TV.Ads", main="Regression Line Sales vs TV.Ads")
abline(lm(Sales.Volume ~ I(TV.ads^2), data=test_data), col="red")