It’s a sales data of dummy customers, objective is to find the best way to predict the Order Qty in by the Customers inormation. We have Following Cusomer info:
+ Cust: Customer Number + Brand : Category or Brand of the Product + Qty : Quanitiy Sold in the give Quarter + Promo : Promotion Applied to the Order + State : State from USA + Qtr : quarterly Data tag with Yeat + Qtrr : quarterly Data tag + Year : Year
dataMkt <- read_excel(path=paste0("mkt_Data.xlsx")) %>% .[,c(33,9,12,15,20,22,35,34)]
names(dataMkt) <- c("Cust","Brand","Qty","Promo","State","Qtr","Year","Qtrr")
dataMkt$Year <- as.character(dataMkt$Year)
dataMkt$Cust <- as.character(dataMkt$Cust)
head(dataMkt)
Groupping the SALES DATA by each Quarter Customer.
lm_cust <- dataMkt %>% group_by(Cust,Qtr,Brand,Promo,State,Year,Qtrr) %>% summarise(Qty = sum(Qty))
describe(lm_cust)
## Warning in describe(lm_cust): NAs introduced by coercion
## Warning in describe(lm_cust): NAs introduced by coercion
## Warning in describe(lm_cust): NAs introduced by coercion
## Warning in describe(lm_cust): NAs introduced by coercion
## Warning in describe(lm_cust): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
Try Predicting Quantity based on Known informaiton . Identifying Best model to predict Quantity .
lm_cust %>%
ggplot(mapping = aes(x=Brand, y = Qty,fill = Qtr)) +
geom_col()+facet_grid(Year~ .)+
theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
scale_y_continuous( labels = scales::number)+
ggtitle("Brand by Year ") +ylab("Ordered Unit")
lm_cust %>%
ggplot(mapping = aes(x=Cust, y = jitter(Qty))) +
geom_point()+
theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
scale_y_continuous( labels = scales::number)+
ggtitle("Brand by Year ") +ylab("Ordered Unit")
lm_cust %>%
ggplot(mapping = aes(x=Cust, y = Qty,fill = Qtr)) +
geom_col()+
theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
scale_y_continuous( labels = scales::number)+
ggtitle("Brand by Year ") +ylab("Ordered Unit")
Creating a sample of 200 records and then We will see if we can Build a model to predict Qty by other known factors. We will follow backword appraoch.
# Result Table
result <- data.frame(var = 1:10,pval = 1:10, comment=1:10)
result$var <- "A"
result$pval <- "A"
result$comment <- "A"
## Ready for Model
set.seed(42672)
lm_cust_s1 <- lm_cust[sample(nrow(lm_cust),200),]
lm1 <- lm(Qty ~ Qtrr + Brand + Promo + State,lm_cust_s1)
summary(lm1)
##
## Call:
## lm(formula = Qty ~ Qtrr + Brand + Promo + State, data = lm_cust_s1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.819 -7.064 -1.331 4.621 109.723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.1903 25.5678 0.477 0.63428
## QtrrQ2 -9.6299 5.5472 -1.736 0.08481 .
## QtrrQ3 -12.5274 8.1055 -1.546 0.12452
## QtrrQ4 -14.7938 7.9852 -1.853 0.06608 .
## BrandAX 0.8196 23.9381 0.034 0.97274
## BrandBB -9.1260 27.4966 -0.332 0.74047
## BrandBE 2.6453 22.7250 0.116 0.90750
## BrandBV -28.5377 31.3228 -0.911 0.36385
## BrandCH 7.5324 21.2010 0.355 0.72292
## BrandDG 4.5916 21.5843 0.213 0.83185
## BrandDY 9.2029 27.6848 0.332 0.74008
## BrandEA -2.8370 22.7429 -0.125 0.90091
## BrandHC 2.4028 20.0190 0.120 0.90464
## BrandMK 7.6045 19.8830 0.382 0.70271
## BrandOJ -29.6001 23.4077 -1.265 0.20818
## BrandOO 1.9130 20.0811 0.095 0.92425
## BrandOX 12.1262 20.5952 0.589 0.55697
## BrandOY 2.7836 20.7103 0.134 0.89328
## BrandPH -6.3724 20.8978 -0.305 0.76088
## BrandPP -6.4604 24.0029 -0.269 0.78822
## BrandPR 4.9872 20.6925 0.241 0.80991
## BrandPS -30.4702 28.0328 -1.087 0.27897
## BrandRA -11.3147 22.7603 -0.497 0.61990
## BrandRB 14.1610 19.7887 0.716 0.47545
## BrandRJ -18.5538 29.6137 -0.627 0.53201
## BrandRX 11.5364 19.6492 0.587 0.55809
## BrandRY -8.4432 20.3798 -0.414 0.67931
## BrandTF 10.6164 22.0164 0.482 0.63043
## BrandTY 15.7957 22.0772 0.715 0.47553
## BrandVE 2.0631 21.2448 0.097 0.92278
## BrandVO -1.5657 22.0143 -0.071 0.94340
## PromoH15 7.8808 11.0910 0.711 0.47856
## PromoHC18 47.4700 21.8053 2.177 0.03119 *
## PromoLUX12 3.3146 11.8530 0.280 0.78018
## PromoLUX20 11.8567 11.4721 1.034 0.30318
## PromoLUX24 13.4856 23.0946 0.584 0.56023
## PromoLUX40 22.3993 11.3079 1.981 0.04961 *
## PromoNASC75 18.8557 16.6408 1.133 0.25915
## PromoOOX40 24.8739 18.4787 1.346 0.18050
## PromoOther 12.5330 11.3643 1.103 0.27203
## PromoOY12 8.0700 18.2033 0.443 0.65823
## PromoPB1 1.8279 21.6845 0.084 0.93294
## PromoPB10 0.3381 20.3775 0.017 0.98679
## PromoPB2 54.0845 18.2090 2.970 0.00352 **
## PromoPB6 -6.7787 17.1785 -0.395 0.69375
## PromoR-PB10 -12.7955 21.1503 -0.605 0.54619
## PromoR-PB15 -11.3565 15.3445 -0.740 0.46051
## PromoRB15 12.2947 15.4961 0.793 0.42891
## PromoRBRX30 7.5800 21.2966 0.356 0.72245
## PromoSY1 17.6224 13.8395 1.273 0.20505
## PromoSY100 22.6608 12.1525 1.865 0.06436 .
## PromoSY200 27.6843 17.7794 1.557 0.12175
## PromoUP1 13.7577 13.5856 1.013 0.31300
## PromoUP3 6.4307 12.6063 0.510 0.61079
## PromoVPFP200 11.3610 18.2192 0.624 0.53394
## PromoWC1 31.3733 13.9749 2.245 0.02637 *
## StateAL -4.9313 14.0264 -0.352 0.72570
## StateAZ -8.3419 15.4916 -0.538 0.59112
## StateCA -7.5266 14.2044 -0.530 0.59705
## StateFL -7.2924 14.4786 -0.504 0.61530
## StateIN -7.0913 15.4281 -0.460 0.64651
## StateMS -1.3027 17.5012 -0.074 0.94077
## StateNY -10.0934 14.8678 -0.679 0.49836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.25 on 137 degrees of freedom
## Multiple R-squared: 0.3341, Adjusted R-squared: 0.03278
## F-statistic: 1.109 on 62 and 137 DF, p-value: 0.3064
anova(lm1)
result$var[1] <- "lm(formula = Order_Qty ~ Qtrr + Brand + Promo + State,
data = lm_cust_s1)"
result$pval[1] <- "Multiple R-squared: 0.3341, Adjusted R-squared: 0.03278
F-statistic: 1.109 on 62 and 137 DF, p-value: 0.3064"
result$comment[1] <- "Qtr and some Promo codes are significant,state is not "
As we can see that we with this model moslty all of the Coefficients are irrelevent excpet Quater and Few Promotions. Adjusted R Square is not great at all with only 5% of data explain.
we need to think how we can improve this.
#Dropping State
lm2 <- lm(Qty ~ Qtrr + Brand + Promo ,lm_cust_s1)
summary(lm2)
##
## Call:
## lm(formula = Qty ~ Qtrr + Brand + Promo, data = lm_cust_s1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.691 -6.599 -1.339 4.526 111.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7758 20.7932 0.182 0.85616
## QtrrQ2 -10.5933 5.2931 -2.001 0.04723 *
## QtrrQ3 -14.4997 7.6925 -1.885 0.06146 .
## QtrrQ4 -16.2344 7.6284 -2.128 0.03503 *
## BrandAX 4.2542 22.8316 0.186 0.85245
## BrandBB -3.1638 26.1208 -0.121 0.90376
## BrandBE 5.4900 21.4896 0.255 0.79872
## BrandBV -24.8512 29.6184 -0.839 0.40283
## BrandCH 9.0667 20.6654 0.439 0.66151
## BrandDG 7.2500 20.5860 0.352 0.72522
## BrandDY 13.1015 26.1218 0.502 0.61675
## BrandEA 0.2452 21.5875 0.011 0.99095
## BrandHC 6.4658 19.1070 0.338 0.73556
## BrandMK 11.0429 18.9608 0.582 0.56120
## BrandOJ -25.4426 22.5096 -1.130 0.26023
## BrandOO 6.6622 18.8854 0.353 0.72478
## BrandOX 16.3335 19.4047 0.842 0.40134
## BrandOY 7.9792 19.4580 0.410 0.68236
## BrandPH -1.6450 19.6655 -0.084 0.93345
## BrandPP -2.5919 22.8117 -0.114 0.90970
## BrandPR 8.1317 19.5860 0.415 0.67863
## BrandPS -27.9053 27.1089 -1.029 0.30503
## BrandRA -6.6589 21.4086 -0.311 0.75622
## BrandRB 18.2302 18.6905 0.975 0.33101
## BrandRJ -11.8320 28.0388 -0.422 0.67366
## BrandRX 16.0544 18.5665 0.865 0.38864
## BrandRY -4.5238 19.3259 -0.234 0.81526
## BrandTF 15.1949 20.7155 0.734 0.46444
## BrandTY 22.0103 20.5828 1.069 0.28670
## BrandVE 5.1585 20.3770 0.253 0.80051
## BrandVO 2.6885 20.6919 0.130 0.89680
## PromoH15 5.2247 10.2017 0.512 0.60933
## PromoHC18 43.6590 20.7815 2.101 0.03740 *
## PromoLUX12 1.2117 11.3585 0.107 0.91519
## PromoLUX20 9.4038 10.9043 0.862 0.38990
## PromoLUX24 10.0293 22.3261 0.449 0.65395
## PromoLUX40 20.6853 10.9123 1.896 0.06002 .
## PromoNASC75 18.9071 16.2702 1.162 0.24713
## PromoOOX40 22.0754 17.6624 1.250 0.21338
## PromoOther 11.8175 10.8371 1.090 0.27733
## PromoOY12 5.5416 17.3815 0.319 0.75032
## PromoPB1 0.5620 21.0517 0.027 0.97874
## PromoPB10 -0.2416 19.9734 -0.012 0.99037
## PromoPB2 53.1144 17.6877 3.003 0.00315 **
## PromoPB6 -7.7736 16.6010 -0.468 0.64030
## PromoR-PB10 -13.8302 20.6794 -0.669 0.50470
## PromoR-PB15 -13.6727 14.8166 -0.923 0.35766
## PromoRB15 9.0562 14.6341 0.619 0.53700
## PromoRBRX30 6.9940 20.8116 0.336 0.73731
## PromoSY1 17.9077 13.4496 1.331 0.18514
## PromoSY100 21.6224 11.6372 1.858 0.06520 .
## PromoSY200 22.6752 16.3075 1.390 0.16653
## PromoUP1 11.0581 12.6546 0.874 0.38366
## PromoUP3 5.7187 11.4280 0.500 0.61755
## PromoVPFP200 7.0523 17.0788 0.413 0.68028
## PromoWC1 27.1295 12.4482 2.179 0.03093 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.9 on 144 degrees of freedom
## Multiple R-squared: 0.327, Adjusted R-squared: 0.06998
## F-statistic: 1.272 on 55 and 144 DF, p-value: 0.1307
anova(lm2)
result$var[2] <- "lm(formula =Qtrr + Brand + Promo, data = lm_cust_s1)"
result$pval[2] <- "Multiple R-squared: 0.327, Adjusted R-squared: 0.06998
F-statistic: 1.272 on 55 and 144 DF, p-value: 0.1307"
result$comment[2] <- "(Month)Brand is not significant"
#Promo promoVPFP200 turns out to more significant here.
Now Grouping the data form DIfferent Brand together to see if that has any pattern in it.
#lm_cust_s12<- mutate(lm_cust_s1,Place = if_else(str_detect(Brand,"/*O|/*R[A-Z]") , 1 , 0))
lm_cust_s12<- mutate(lm_cust_s1,Place = case_when(str_detect(Brand,"/*O[A-Z]") ~ 3,
str_detect(Brand,"/*R[A-Z]") ~ 1,
TRUE ~ 2))
head(lm_cust_s12)
#3 Dropping brand info
lm3 <- lm(Qty ~ Qtrr + Brand + Promo + Place,lm_cust_s12)
summary(lm3)
##
## Call:
## lm(formula = Qty ~ Qtrr + Brand + Promo + Place, data = lm_cust_s12)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.691 -6.599 -1.339 4.526 111.590
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7758 20.7932 0.182 0.85616
## QtrrQ2 -10.5933 5.2931 -2.001 0.04723 *
## QtrrQ3 -14.4997 7.6925 -1.885 0.06146 .
## QtrrQ4 -16.2344 7.6284 -2.128 0.03503 *
## BrandAX 4.2542 22.8316 0.186 0.85245
## BrandBB -3.1638 26.1208 -0.121 0.90376
## BrandBE 5.4900 21.4896 0.255 0.79872
## BrandBV -24.8512 29.6184 -0.839 0.40283
## BrandCH 9.0667 20.6654 0.439 0.66151
## BrandDG 7.2500 20.5860 0.352 0.72522
## BrandDY 13.1015 26.1218 0.502 0.61675
## BrandEA 0.2452 21.5875 0.011 0.99095
## BrandHC 6.4658 19.1070 0.338 0.73556
## BrandMK 11.0429 18.9608 0.582 0.56120
## BrandOJ -25.4426 22.5096 -1.130 0.26023
## BrandOO 6.6622 18.8854 0.353 0.72478
## BrandOX 16.3335 19.4047 0.842 0.40134
## BrandOY 7.9792 19.4580 0.410 0.68236
## BrandPH -1.6450 19.6655 -0.084 0.93345
## BrandPP -2.5919 22.8117 -0.114 0.90970
## BrandPR 8.1317 19.5860 0.415 0.67863
## BrandPS -27.9053 27.1089 -1.029 0.30503
## BrandRA -6.6589 21.4086 -0.311 0.75622
## BrandRB 18.2302 18.6905 0.975 0.33101
## BrandRJ -11.8320 28.0388 -0.422 0.67366
## BrandRX 16.0544 18.5665 0.865 0.38864
## BrandRY -4.5238 19.3259 -0.234 0.81526
## BrandTF 15.1949 20.7155 0.734 0.46444
## BrandTY 22.0103 20.5828 1.069 0.28670
## BrandVE 5.1585 20.3770 0.253 0.80051
## BrandVO 2.6885 20.6919 0.130 0.89680
## PromoH15 5.2247 10.2017 0.512 0.60933
## PromoHC18 43.6590 20.7815 2.101 0.03740 *
## PromoLUX12 1.2117 11.3585 0.107 0.91519
## PromoLUX20 9.4038 10.9043 0.862 0.38990
## PromoLUX24 10.0293 22.3261 0.449 0.65395
## PromoLUX40 20.6853 10.9123 1.896 0.06002 .
## PromoNASC75 18.9071 16.2702 1.162 0.24713
## PromoOOX40 22.0754 17.6624 1.250 0.21338
## PromoOther 11.8175 10.8371 1.090 0.27733
## PromoOY12 5.5416 17.3815 0.319 0.75032
## PromoPB1 0.5620 21.0517 0.027 0.97874
## PromoPB10 -0.2416 19.9734 -0.012 0.99037
## PromoPB2 53.1144 17.6877 3.003 0.00315 **
## PromoPB6 -7.7736 16.6010 -0.468 0.64030
## PromoR-PB10 -13.8302 20.6794 -0.669 0.50470
## PromoR-PB15 -13.6727 14.8166 -0.923 0.35766
## PromoRB15 9.0562 14.6341 0.619 0.53700
## PromoRBRX30 6.9940 20.8116 0.336 0.73731
## PromoSY1 17.9077 13.4496 1.331 0.18514
## PromoSY100 21.6224 11.6372 1.858 0.06520 .
## PromoSY200 22.6752 16.3075 1.390 0.16653
## PromoUP1 11.0581 12.6546 0.874 0.38366
## PromoUP3 5.7187 11.4280 0.500 0.61755
## PromoVPFP200 7.0523 17.0788 0.413 0.68028
## PromoWC1 27.1295 12.4482 2.179 0.03093 *
## Place NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.9 on 144 degrees of freedom
## Multiple R-squared: 0.327, Adjusted R-squared: 0.06998
## F-statistic: 1.272 on 55 and 144 DF, p-value: 0.1307
anova(lm3)
result$var[3] <- "lm(formula = Order_Qty ~ Qtrr + Brand + Promo + Place, data = lm_cust_s12)"
result$pval[3] <- "Multiple R-squared: 0.327, Adjusted R-squared: 0.06998
F-statistic: 1.272 on 55 and 144 DF, p-value: 0.1307"
result$comment[3] <- " "