goal : Predict weekly sales for departments for the following year
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(corrplot)
library(rms)
library(grid)
features_data <- read_csv("Features data set.csv")
## Parsed with column specification:
## cols(
## Store = col_integer(),
## Date = col_character(),
## Temperature = col_double(),
## Fuel_Price = col_double(),
## MarkDown1 = col_double(),
## MarkDown2 = col_double(),
## MarkDown3 = col_double(),
## MarkDown4 = col_double(),
## MarkDown5 = col_double(),
## CPI = col_double(),
## Unemployment = col_double(),
## IsHoliday = col_logical()
## )
sales_data <- read_csv("sales data-set.csv")
## Parsed with column specification:
## cols(
## Store = col_integer(),
## Dept = col_integer(),
## Date = col_character(),
## Weekly_Sales = col_double(),
## IsHoliday = col_logical()
## )
stores_data <- read_csv("stores data-set.csv")
## Parsed with column specification:
## cols(
## Store = col_integer(),
## Type = col_character(),
## Size = col_integer()
## )
#merge data
sales_merge_1 <- merge(sales_data,features_data)
sales_merge_2 <- merge(sales_merge_1,stores_data)
#re-order column
sales_merge_3 <- sales_merge_2[,c(1,15,16, 2:14)]
# set var.to correct class (factor)
sales_merge_3$Store <- factor(sales_merge_3$Store, ordered = F)
sales_merge_3$Type <- factor(sales_merge_3$Type)
sales_merge_3$Dept <- factor(sales_merge_3$Dept)
#Date
sales_merge_3$Date <- dmy(sales_merge_3$Date)
sales_data_2010 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2010" )
sales_data_2011 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2011" )
sales_data_2012 <- sales_merge_3 %>% filter( year(sales_merge_3$Date) == "2012" )
ggplot(sales_data_2010 %>% filter(sales_data_2010$Weekly_Sales < 10000 ))+
geom_boxplot(aes(x= Type , y= Weekly_Sales))
see how holiday and markdown affect sales
ggplot(sales_data_2010)+
geom_boxplot(aes(x= IsHoliday , y= Weekly_Sales))
# note that outliers
There’s no much difference of sales on whether its holiday or not; however, the there’s some interesting outliner.
ggplot(sales_data_2010 %>% filter(sales_data_2010$Weekly_Sales<10000 ))+
geom_boxplot(aes(x= IsHoliday , y= Weekly_Sales))
sales_LongFormat <- sales_merge_3 %>% gather(key = variables, value = "n", 7:16 )
ggplot(sales_LongFormat, aes(x= Date , y = n, fill = sales_LongFormat$variables))+
geom_line(aes(colour = sales_LongFormat$variables, fill = sales_LongFormat$variables))+
facet_grid(sales_LongFormat$variables ~ .)
## Warning: Ignoring unknown aesthetics: fill
## Warning: Removed 1350690 rows containing missing values (geom_path).
sales_data_2010 %>%
dplyr::select(7:9,15,16) %>%
cor() %>%
corrplot(method = "number")
#CPI = Consumer Price Index
sales_data_2010.11 <- rbind(sales_data_2010,sales_data_2011)
multipleLM <- lm(formula = sales_data_2011$Weekly_Sales ~ Type + Size + Date + IsHoliday+Dept+ Temperature + Fuel_Price + CPI + Unemployment , data = sales_data_2011)
summary(multipleLM)
##
## Call:
## lm(formula = sales_data_2011$Weekly_Sales ~ Type + Size + Date +
## IsHoliday + Dept + Temperature + Fuel_Price + CPI + Unemployment,
## data = sales_data_2011)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65321 -6256 -1361 4699 598333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.126e+05 5.501e+03 -20.462 < 2e-16 ***
## TypeB 2.197e+03 1.146e+02 19.165 < 2e-16 ***
## TypeC 3.050e+03 1.932e+02 15.790 < 2e-16 ***
## Size 1.263e-01 1.030e-03 122.614 < 2e-16 ***
## Date 8.364e+00 3.606e-01 23.192 < 2e-16 ***
## IsHolidayTRUE 9.302e+02 1.401e+02 6.638 3.19e-11 ***
## Dept2 2.400e+04 4.113e+02 58.345 < 2e-16 ***
## Dept3 -7.834e+03 4.113e+02 -19.046 < 2e-16 ***
## Dept4 6.503e+03 4.113e+02 15.810 < 2e-16 ***
## Dept5 2.241e+03 4.123e+02 5.435 5.48e-08 ***
## Dept6 -1.558e+04 4.190e+02 -37.174 < 2e-16 ***
## Dept7 5.921e+03 4.113e+02 14.395 < 2e-16 ***
## Dept8 1.055e+04 4.113e+02 25.650 < 2e-16 ***
## Dept9 -4.175e+01 4.125e+02 -0.101 0.91939
## Dept10 -1.050e+03 4.113e+02 -2.552 0.01071 *
## Dept11 -4.934e+03 4.113e+02 -11.996 < 2e-16 ***
## Dept12 -1.539e+04 4.113e+02 -37.409 < 2e-16 ***
## Dept13 1.076e+04 4.113e+02 26.161 < 2e-16 ***
## Dept14 -4.412e+03 4.113e+02 -10.726 < 2e-16 ***
## Dept16 -6.423e+03 4.113e+02 -15.616 < 2e-16 ***
## Dept17 -9.321e+03 4.114e+02 -22.657 < 2e-16 ***
## Dept18 -1.165e+04 4.341e+02 -26.848 < 2e-16 ***
## Dept19 -2.042e+04 4.641e+02 -44.001 < 2e-16 ***
## Dept20 -1.466e+04 4.181e+02 -35.073 < 2e-16 ***
## Dept21 -1.450e+04 4.113e+02 -35.261 < 2e-16 ***
## Dept22 -1.039e+04 4.280e+02 -24.281 < 2e-16 ***
## Dept23 3.013e+03 4.205e+02 7.165 7.82e-13 ***
## Dept24 -1.506e+04 4.302e+02 -35.005 < 2e-16 ***
## Dept25 -1.058e+04 4.123e+02 -25.668 < 2e-16 ***
## Dept26 -1.299e+04 4.242e+02 -30.631 < 2e-16 ***
## Dept27 -1.905e+04 4.228e+02 -45.056 < 2e-16 ***
## Dept28 -1.937e+04 4.164e+02 -46.527 < 2e-16 ***
## Dept29 -1.593e+04 4.330e+02 -36.795 < 2e-16 ***
## Dept30 -1.765e+04 4.333e+02 -40.726 < 2e-16 ***
## Dept31 -1.774e+04 4.184e+02 -42.386 < 2e-16 ***
## Dept32 -1.348e+04 4.219e+02 -31.958 < 2e-16 ***
## Dept33 -1.501e+04 4.309e+02 -34.825 < 2e-16 ***
## Dept34 -6.692e+03 4.322e+02 -15.484 < 2e-16 ***
## Dept35 -1.868e+04 4.330e+02 -43.127 < 2e-16 ***
## Dept36 -1.955e+04 4.333e+02 -45.122 < 2e-16 ***
## Dept37 -2.240e+04 5.445e+02 -41.133 < 2e-16 ***
## Dept38 4.184e+04 4.113e+02 101.729 < 2e-16 ***
## Dept39 -2.917e+04 1.407e+04 -2.073 0.03821 *
## Dept40 2.477e+04 4.113e+02 60.232 < 2e-16 ***
## Dept41 -1.914e+04 4.300e+02 -44.514 < 2e-16 ***
## Dept42 -1.429e+04 4.113e+02 -34.751 < 2e-16 ***
## Dept43 -2.360e+04 8.128e+03 -2.904 0.00369 **
## Dept44 -1.662e+04 4.306e+02 -38.586 < 2e-16 ***
## Dept45 -2.156e+04 6.057e+02 -35.596 < 2e-16 ***
## Dept46 7.227e+02 4.113e+02 1.757 0.07891 .
## Dept47 -2.186e+04 1.025e+03 -21.329 < 2e-16 ***
## Dept48 -2.231e+04 6.248e+02 -35.710 < 2e-16 ***
## Dept49 -1.473e+04 4.603e+02 -32.007 < 2e-16 ***
## Dept50 -2.159e+04 6.555e+02 -32.935 < 2e-16 ***
## Dept51 -2.043e+04 7.716e+02 -26.482 < 2e-16 ***
## Dept52 -1.770e+04 4.134e+02 -42.803 < 2e-16 ***
## Dept54 -2.192e+04 4.457e+02 -49.182 < 2e-16 ***
## Dept55 -9.584e+03 4.267e+02 -22.462 < 2e-16 ***
## Dept56 -1.684e+04 4.212e+02 -39.981 < 2e-16 ***
## Dept58 -1.943e+04 4.575e+02 -42.470 < 2e-16 ***
## Dept59 -1.916e+04 4.151e+02 -46.155 < 2e-16 ***
## Dept60 -1.899e+04 4.144e+02 -45.819 < 2e-16 ***
## Dept65 1.944e+04 1.974e+03 9.844 < 2e-16 ***
## Dept67 -1.207e+04 4.113e+02 -29.335 < 2e-16 ***
## Dept71 -1.598e+04 4.330e+02 -36.897 < 2e-16 ***
## Dept72 3.208e+04 4.178e+02 76.798 < 2e-16 ***
## Dept74 -5.729e+03 4.113e+02 -13.928 < 2e-16 ***
## Dept77 -2.243e+04 1.482e+03 -15.141 < 2e-16 ***
## Dept78 -2.399e+04 1.886e+03 -12.718 < 2e-16 ***
## Dept79 2.279e+03 4.113e+02 5.540 3.03e-08 ***
## Dept80 -7.675e+03 4.201e+02 -18.269 < 2e-16 ***
## Dept81 -3.950e+03 4.113e+02 -9.602 < 2e-16 ***
## Dept82 -3.184e+03 4.113e+02 -7.740 1.00e-14 ***
## Dept83 -1.679e+04 4.210e+02 -39.881 < 2e-16 ***
## Dept85 -1.790e+04 4.184e+02 -42.794 < 2e-16 ***
## Dept87 -6.047e+03 4.117e+02 -14.689 < 2e-16 ***
## Dept90 2.675e+04 4.113e+02 65.040 < 2e-16 ***
## Dept91 1.432e+04 4.113e+02 34.826 < 2e-16 ***
## Dept92 5.604e+04 4.113e+02 136.239 < 2e-16 ***
## Dept93 7.169e+03 4.209e+02 17.032 < 2e-16 ***
## Dept94 1.400e+04 4.270e+02 32.784 < 2e-16 ***
## Dept95 4.967e+04 4.113e+02 120.752 < 2e-16 ***
## Dept96 -3.798e+03 4.460e+02 -8.514 < 2e-16 ***
## Dept97 -5.091e+03 4.139e+02 -12.300 < 2e-16 ***
## Dept98 -1.310e+04 4.213e+02 -31.089 < 2e-16 ***
## Dept99 -2.537e+04 8.539e+02 -29.707 < 2e-16 ***
## Temperature 2.248e+01 2.474e+00 9.086 < 2e-16 ***
## Fuel_Price -1.924e+03 1.690e+02 -11.383 < 2e-16 ***
## CPI -2.230e+01 1.134e+00 -19.667 < 2e-16 ***
## Unemployment -4.043e+02 2.088e+01 -19.362 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14070 on 153363 degrees of freedom
## Multiple R-squared: 0.6217, Adjusted R-squared: 0.6214
## F-statistic: 2831 on 89 and 153363 DF, p-value: < 2.2e-16
plot(multipleLM)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Checking variance inflation: all below 5, which is safe to proceed
vif(multipleLM)
## TypeB TypeC Size Date IsHolidayTRUE
## 2.418269 2.629190 3.062631 1.116721 1.090634
## Dept2 Dept3 Dept4 Dept5 Dept6
## 1.969502 1.969502 1.969502 1.960136 1.903070
## Dept7 Dept8 Dept9 Dept10 Dept11
## 1.969502 1.969502 1.958520 1.969502 1.969502
## Dept12 Dept13 Dept14 Dept16 Dept17
## 1.969502 1.969502 1.969502 1.969502 1.968687
## Dept18 Dept19 Dept20 Dept21 Dept22
## 1.793569 1.635154 1.911246 1.969502 1.837344
## Dept23 Dept24 Dept25 Dept26 Dept27
## 1.891335 1.822209 1.960142 1.864681 1.873611
## Dept28 Dept29 Dept30 Dept31 Dept32
## 1.925556 1.804426 1.802469 1.907884 1.880869
## Dept33 Dept34 Dept35 Dept36 Dept37
## 1.817861 1.809588 1.804466 1.802469 1.398034
## Dept38 Dept39 Dept40 Dept41 Dept42
## 1.969502 1.000466 1.969502 1.823846 1.969502
## Dept43 Dept44 Dept45 Dept46 Dept47
## 1.001311 1.819803 1.295210 1.969502 1.086677
## Dept48 Dept49 Dept50 Dept51 Dept52
## 1.274421 1.653703 1.245829 1.163942 1.950436
## Dept54 Dept55 Dept56 Dept58 Dept59
## 1.727231 1.847765 1.885637 1.666762 1.935758
## Dept60 Dept65 Dept67 Dept71 Dept72
## 1.941788 1.023789 1.969502 1.804427 1.913708
## Dept74 Dept77 Dept78 Dept79 Dept80
## 1.969502 1.041860 1.024348 1.969502 1.894329
## Dept81 Dept82 Dept83 Dept85 Dept87
## 1.969502 1.969502 1.887486 1.908837 1.966241
## Dept90 Dept91 Dept92 Dept93 Dept94
## 1.969502 1.969502 1.969502 1.887894 1.841733
## Dept95 Dept96 Dept97 Dept98 Dept99
## 1.969502 1.722169 1.945883 1.884949 1.132346
## Temperature Fuel_Price CPI Unemployment
## 1.839111 1.840216 1.500770 1.177636
make prediction on weekly sales
preSales <- predict(multipleLM , newdata = sales_data_2012)
mean(preSales, na.rm = T)
## [1] 18705.25
mean(sales_data_2012$Weekly_Sales)
## [1] 15694.95
Need to check if there’s overfitting. Try feature engineering to improve accuracy.