goal : Predict weekly sales for departments for the following year

loading data

library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(corrplot)
library(rms)
library(grid)
features_data <- read_csv("Features data set.csv")
## Parsed with column specification:
## cols(
##   Store = col_integer(),
##   Date = col_character(),
##   Temperature = col_double(),
##   Fuel_Price = col_double(),
##   MarkDown1 = col_double(),
##   MarkDown2 = col_double(),
##   MarkDown3 = col_double(),
##   MarkDown4 = col_double(),
##   MarkDown5 = col_double(),
##   CPI = col_double(),
##   Unemployment = col_double(),
##   IsHoliday = col_logical()
## )
sales_data <- read_csv("sales data-set.csv")
## Parsed with column specification:
## cols(
##   Store = col_integer(),
##   Dept = col_integer(),
##   Date = col_character(),
##   Weekly_Sales = col_double(),
##   IsHoliday = col_logical()
## )
stores_data <- read_csv("stores data-set.csv")
## Parsed with column specification:
## cols(
##   Store = col_integer(),
##   Type = col_character(),
##   Size = col_integer()
## )

merge data

#merge data
sales_merge_1  <- merge(sales_data,features_data)

sales_merge_2  <- merge(sales_merge_1,stores_data)

#re-order column
sales_merge_3 <- sales_merge_2[,c(1,15,16, 2:14)]

data wrangling

# set var.to correct class (factor) 
sales_merge_3$Store <- factor(sales_merge_3$Store, ordered = F)
sales_merge_3$Type <- factor(sales_merge_3$Type)
sales_merge_3$Dept <- factor(sales_merge_3$Dept)

#Date
sales_merge_3$Date <- dmy(sales_merge_3$Date)

divide data by different years

sales_data_2010 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2010" )
sales_data_2011 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2011" )
sales_data_2012 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2012" )

visualize categorical variables over weekly_sales

ggplot(sales_data_2010  %>% filter(sales_data_2010$Weekly_Sales < 10000 ))+
  geom_boxplot(aes(x= Type , y= Weekly_Sales))

see how holiday and markdown affect sales

ggplot(sales_data_2010)+
  geom_boxplot(aes(x= IsHoliday , y= Weekly_Sales))

# note that outliers

There’s no much difference of sales on whether its holiday or not; however, the there’s some interesting outliner.

ggplot(sales_data_2010 %>% filter(sales_data_2010$Weekly_Sales<10000 ))+
  geom_boxplot(aes(x= IsHoliday , y= Weekly_Sales))

sales_LongFormat <- sales_merge_3 %>% gather(key = variables, value = "n", 7:16 )

ggplot(sales_LongFormat, aes(x= Date , y = n, fill = sales_LongFormat$variables))+
  geom_line(aes(colour = sales_LongFormat$variables, fill = sales_LongFormat$variables))+
  facet_grid(sales_LongFormat$variables ~ .)
## Warning: Ignoring unknown aesthetics: fill
## Warning: Removed 1350690 rows containing missing values (geom_path).

the empty part are missing data, also the y scale need to be adjusted

correlation

sales_data_2010 %>% 
  dplyr::select(7:9,15,16) %>% 
  cor() %>% 
  corrplot(method = "number")

#CPI = Consumer Price Index

Build Model

sales_data_2010.11 <- rbind(sales_data_2010,sales_data_2011)

multipleLM <- lm(formula = sales_data_2011$Weekly_Sales ~ Type + Size + Date + IsHoliday+Dept+ Temperature + Fuel_Price + CPI + Unemployment , data = sales_data_2011)

summary(multipleLM)
## 
## Call:
## lm(formula = sales_data_2011$Weekly_Sales ~ Type + Size + Date + 
##     IsHoliday + Dept + Temperature + Fuel_Price + CPI + Unemployment, 
##     data = sales_data_2011)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65321  -6256  -1361   4699 598333 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.126e+05  5.501e+03 -20.462  < 2e-16 ***
## TypeB          2.197e+03  1.146e+02  19.165  < 2e-16 ***
## TypeC          3.050e+03  1.932e+02  15.790  < 2e-16 ***
## Size           1.263e-01  1.030e-03 122.614  < 2e-16 ***
## Date           8.364e+00  3.606e-01  23.192  < 2e-16 ***
## IsHolidayTRUE  9.302e+02  1.401e+02   6.638 3.19e-11 ***
## Dept2          2.400e+04  4.113e+02  58.345  < 2e-16 ***
## Dept3         -7.834e+03  4.113e+02 -19.046  < 2e-16 ***
## Dept4          6.503e+03  4.113e+02  15.810  < 2e-16 ***
## Dept5          2.241e+03  4.123e+02   5.435 5.48e-08 ***
## Dept6         -1.558e+04  4.190e+02 -37.174  < 2e-16 ***
## Dept7          5.921e+03  4.113e+02  14.395  < 2e-16 ***
## Dept8          1.055e+04  4.113e+02  25.650  < 2e-16 ***
## Dept9         -4.175e+01  4.125e+02  -0.101  0.91939    
## Dept10        -1.050e+03  4.113e+02  -2.552  0.01071 *  
## Dept11        -4.934e+03  4.113e+02 -11.996  < 2e-16 ***
## Dept12        -1.539e+04  4.113e+02 -37.409  < 2e-16 ***
## Dept13         1.076e+04  4.113e+02  26.161  < 2e-16 ***
## Dept14        -4.412e+03  4.113e+02 -10.726  < 2e-16 ***
## Dept16        -6.423e+03  4.113e+02 -15.616  < 2e-16 ***
## Dept17        -9.321e+03  4.114e+02 -22.657  < 2e-16 ***
## Dept18        -1.165e+04  4.341e+02 -26.848  < 2e-16 ***
## Dept19        -2.042e+04  4.641e+02 -44.001  < 2e-16 ***
## Dept20        -1.466e+04  4.181e+02 -35.073  < 2e-16 ***
## Dept21        -1.450e+04  4.113e+02 -35.261  < 2e-16 ***
## Dept22        -1.039e+04  4.280e+02 -24.281  < 2e-16 ***
## Dept23         3.013e+03  4.205e+02   7.165 7.82e-13 ***
## Dept24        -1.506e+04  4.302e+02 -35.005  < 2e-16 ***
## Dept25        -1.058e+04  4.123e+02 -25.668  < 2e-16 ***
## Dept26        -1.299e+04  4.242e+02 -30.631  < 2e-16 ***
## Dept27        -1.905e+04  4.228e+02 -45.056  < 2e-16 ***
## Dept28        -1.937e+04  4.164e+02 -46.527  < 2e-16 ***
## Dept29        -1.593e+04  4.330e+02 -36.795  < 2e-16 ***
## Dept30        -1.765e+04  4.333e+02 -40.726  < 2e-16 ***
## Dept31        -1.774e+04  4.184e+02 -42.386  < 2e-16 ***
## Dept32        -1.348e+04  4.219e+02 -31.958  < 2e-16 ***
## Dept33        -1.501e+04  4.309e+02 -34.825  < 2e-16 ***
## Dept34        -6.692e+03  4.322e+02 -15.484  < 2e-16 ***
## Dept35        -1.868e+04  4.330e+02 -43.127  < 2e-16 ***
## Dept36        -1.955e+04  4.333e+02 -45.122  < 2e-16 ***
## Dept37        -2.240e+04  5.445e+02 -41.133  < 2e-16 ***
## Dept38         4.184e+04  4.113e+02 101.729  < 2e-16 ***
## Dept39        -2.917e+04  1.407e+04  -2.073  0.03821 *  
## Dept40         2.477e+04  4.113e+02  60.232  < 2e-16 ***
## Dept41        -1.914e+04  4.300e+02 -44.514  < 2e-16 ***
## Dept42        -1.429e+04  4.113e+02 -34.751  < 2e-16 ***
## Dept43        -2.360e+04  8.128e+03  -2.904  0.00369 ** 
## Dept44        -1.662e+04  4.306e+02 -38.586  < 2e-16 ***
## Dept45        -2.156e+04  6.057e+02 -35.596  < 2e-16 ***
## Dept46         7.227e+02  4.113e+02   1.757  0.07891 .  
## Dept47        -2.186e+04  1.025e+03 -21.329  < 2e-16 ***
## Dept48        -2.231e+04  6.248e+02 -35.710  < 2e-16 ***
## Dept49        -1.473e+04  4.603e+02 -32.007  < 2e-16 ***
## Dept50        -2.159e+04  6.555e+02 -32.935  < 2e-16 ***
## Dept51        -2.043e+04  7.716e+02 -26.482  < 2e-16 ***
## Dept52        -1.770e+04  4.134e+02 -42.803  < 2e-16 ***
## Dept54        -2.192e+04  4.457e+02 -49.182  < 2e-16 ***
## Dept55        -9.584e+03  4.267e+02 -22.462  < 2e-16 ***
## Dept56        -1.684e+04  4.212e+02 -39.981  < 2e-16 ***
## Dept58        -1.943e+04  4.575e+02 -42.470  < 2e-16 ***
## Dept59        -1.916e+04  4.151e+02 -46.155  < 2e-16 ***
## Dept60        -1.899e+04  4.144e+02 -45.819  < 2e-16 ***
## Dept65         1.944e+04  1.974e+03   9.844  < 2e-16 ***
## Dept67        -1.207e+04  4.113e+02 -29.335  < 2e-16 ***
## Dept71        -1.598e+04  4.330e+02 -36.897  < 2e-16 ***
## Dept72         3.208e+04  4.178e+02  76.798  < 2e-16 ***
## Dept74        -5.729e+03  4.113e+02 -13.928  < 2e-16 ***
## Dept77        -2.243e+04  1.482e+03 -15.141  < 2e-16 ***
## Dept78        -2.399e+04  1.886e+03 -12.718  < 2e-16 ***
## Dept79         2.279e+03  4.113e+02   5.540 3.03e-08 ***
## Dept80        -7.675e+03  4.201e+02 -18.269  < 2e-16 ***
## Dept81        -3.950e+03  4.113e+02  -9.602  < 2e-16 ***
## Dept82        -3.184e+03  4.113e+02  -7.740 1.00e-14 ***
## Dept83        -1.679e+04  4.210e+02 -39.881  < 2e-16 ***
## Dept85        -1.790e+04  4.184e+02 -42.794  < 2e-16 ***
## Dept87        -6.047e+03  4.117e+02 -14.689  < 2e-16 ***
## Dept90         2.675e+04  4.113e+02  65.040  < 2e-16 ***
## Dept91         1.432e+04  4.113e+02  34.826  < 2e-16 ***
## Dept92         5.604e+04  4.113e+02 136.239  < 2e-16 ***
## Dept93         7.169e+03  4.209e+02  17.032  < 2e-16 ***
## Dept94         1.400e+04  4.270e+02  32.784  < 2e-16 ***
## Dept95         4.967e+04  4.113e+02 120.752  < 2e-16 ***
## Dept96        -3.798e+03  4.460e+02  -8.514  < 2e-16 ***
## Dept97        -5.091e+03  4.139e+02 -12.300  < 2e-16 ***
## Dept98        -1.310e+04  4.213e+02 -31.089  < 2e-16 ***
## Dept99        -2.537e+04  8.539e+02 -29.707  < 2e-16 ***
## Temperature    2.248e+01  2.474e+00   9.086  < 2e-16 ***
## Fuel_Price    -1.924e+03  1.690e+02 -11.383  < 2e-16 ***
## CPI           -2.230e+01  1.134e+00 -19.667  < 2e-16 ***
## Unemployment  -4.043e+02  2.088e+01 -19.362  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14070 on 153363 degrees of freedom
## Multiple R-squared:  0.6217, Adjusted R-squared:  0.6214 
## F-statistic:  2831 on 89 and 153363 DF,  p-value: < 2.2e-16
plot(multipleLM)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Checking variance inflation: all below 5, which is safe to proceed

vif(multipleLM)
##         TypeB         TypeC          Size          Date IsHolidayTRUE 
##      2.418269      2.629190      3.062631      1.116721      1.090634 
##         Dept2         Dept3         Dept4         Dept5         Dept6 
##      1.969502      1.969502      1.969502      1.960136      1.903070 
##         Dept7         Dept8         Dept9        Dept10        Dept11 
##      1.969502      1.969502      1.958520      1.969502      1.969502 
##        Dept12        Dept13        Dept14        Dept16        Dept17 
##      1.969502      1.969502      1.969502      1.969502      1.968687 
##        Dept18        Dept19        Dept20        Dept21        Dept22 
##      1.793569      1.635154      1.911246      1.969502      1.837344 
##        Dept23        Dept24        Dept25        Dept26        Dept27 
##      1.891335      1.822209      1.960142      1.864681      1.873611 
##        Dept28        Dept29        Dept30        Dept31        Dept32 
##      1.925556      1.804426      1.802469      1.907884      1.880869 
##        Dept33        Dept34        Dept35        Dept36        Dept37 
##      1.817861      1.809588      1.804466      1.802469      1.398034 
##        Dept38        Dept39        Dept40        Dept41        Dept42 
##      1.969502      1.000466      1.969502      1.823846      1.969502 
##        Dept43        Dept44        Dept45        Dept46        Dept47 
##      1.001311      1.819803      1.295210      1.969502      1.086677 
##        Dept48        Dept49        Dept50        Dept51        Dept52 
##      1.274421      1.653703      1.245829      1.163942      1.950436 
##        Dept54        Dept55        Dept56        Dept58        Dept59 
##      1.727231      1.847765      1.885637      1.666762      1.935758 
##        Dept60        Dept65        Dept67        Dept71        Dept72 
##      1.941788      1.023789      1.969502      1.804427      1.913708 
##        Dept74        Dept77        Dept78        Dept79        Dept80 
##      1.969502      1.041860      1.024348      1.969502      1.894329 
##        Dept81        Dept82        Dept83        Dept85        Dept87 
##      1.969502      1.969502      1.887486      1.908837      1.966241 
##        Dept90        Dept91        Dept92        Dept93        Dept94 
##      1.969502      1.969502      1.969502      1.887894      1.841733 
##        Dept95        Dept96        Dept97        Dept98        Dept99 
##      1.969502      1.722169      1.945883      1.884949      1.132346 
##   Temperature    Fuel_Price           CPI  Unemployment 
##      1.839111      1.840216      1.500770      1.177636

make prediction on weekly sales

preSales <- predict(multipleLM , newdata = sales_data_2012)  
mean(preSales, na.rm = T)
## [1] 18705.25
mean(sales_data_2012$Weekly_Sales)
## [1] 15694.95

Future Improvements:

Need to check if there’s overfitting. Try feature engineering to improve accuracy.