store_data=Hackathon_Working_Data
store_valid=Hackathon_Validation_Data
head(store_data,10)
NA
NA
colSums(is.na(store_data))
MONTH STORECODE DAY BILL_ID BILL_AMT QTY VALUE
0 0 0 0 0 0 0
PRICE GRP SGRP SSGRP CMP MBRD BRD
0 0 0 0 0 0 0
It shows that there are no missing value in the data so we can mobve over to the descriptive analysis of the data.
summary(store_data)
MONTH STORECODE DAY
Length:26985 Length:26985 Min. : 1.00
Class :character Class :character 1st Qu.: 7.00
Mode :character Mode :character Median :14.00
Mean :15.17
3rd Qu.:23.00
Max. :31.00
BILL_ID BILL_AMT QTY
Length:26985 Min. : 0.0 Min. : 0.500
Class :character 1st Qu.: 40.0 1st Qu.: 1.000
Mode :character Median : 111.0 Median : 1.000
Mean : 278.8 Mean : 4.105
3rd Qu.: 280.0 3rd Qu.: 2.000
Max. :7292.0 Max. :12000.000
VALUE PRICE GRP
Min. : 0.00 Min. : 0.00 Length:26985
1st Qu.: 10.00 1st Qu.: 10.00 Class :character
Median : 30.00 Median : 22.00 Mode :character
Mean : 67.81 Mean : 52.81
3rd Qu.: 80.00 3rd Qu.: 64.00
Max. :3150.00 Max. :3150.00
SGRP SSGRP CMP
Length:26985 Length:26985 Length:26985
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
MBRD BRD
Length:26985 Length:26985
Class :character Class :character
Mode :character Mode :character
The above shown is the descriptive information about each feature (i.e central tendency of numeric features and count of categorical variables).
str(store_data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 26985 obs. of 14 variables:
$ MONTH : chr "M1" "M1" "M1" "M1" ...
$ STORECODE: chr "N1" "N1" "N1" "N1" ...
$ DAY : num 4 4 4 4 4 4 4 4 4 4 ...
$ BILL_ID : chr "T375" "T379" "T381" "T382" ...
$ BILL_AMT : num 225 95 10 108 19 109 109 25 25 25 ...
$ QTY : num 1 1 1 1 1 1 1 1 1 1 ...
$ VALUE : num 225 95 10 108 19 10 99 5 10 10 ...
$ PRICE : num 225 95 10 108 19 10 99 5 10 10 ...
$ GRP : chr "BUTTER MARGR (4/94)" "CONFECTIONERY - ECLAIRS" "CHOCOLATE" "PACKAGED TEA" ...
$ SGRP : chr "BUTTER" "CONFECTIONERY - ECLAIRS" "CHOCOLATE PANNED" "MAIN PACKS" ...
$ SSGRP : chr "SALTED" "CONFECTIONERY - ECLAIRS" "CHOCOLATE PANNED" "MAIN PACKS" ...
$ CMP : chr "G C M M F" "PARLE PRODS" "MONDELEZ INTERNATIONAL" "GUJ TEA PROCESSORS" ...
$ MBRD : chr "AMUL" "MELODY" "CADBURY SHOTS" "WAGH BAKRI" ...
$ BRD : chr "AMUL" "MELODY CHOCOLATY" "CADBURY SHOTS" "WAGH BAKRI INSTANT" ...
- attr(*, "spec")=List of 3
..$ cols :List of 14
.. ..$ MONTH : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ STORECODE: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DAY : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ BILL_ID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ BILL_AMT : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ QTY : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ VALUE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ PRICE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ GRP : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SGRP : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SSGRP : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ CMP : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ MBRD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ BRD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..$ skip : num 1
..- attr(*, "class")= chr "col_spec"
The above information signifies mainly about the class type of each variables and as we can see some of variables consists of data in string format we need to convert it into factor in order to use it for analysis purpose.
chrc=colnames(store_data[sapply(store_data,class)=="character"])
store_data[,chrc]=lapply(store_data[,chrc],as.factor)
Sales vs Month
library(ggplot2)
st=store_data[(store_data$MONTH=="M1"),]
st=as.data.frame.matrix(st)
MAX=colSums(st$VALUE)
library(dplyr)
ggplot(store_data,aes(MONTH,VALUE))+geom_bar(stat = "identity",fill="skyblue")+geom_label(data = store_data %>% filter(MONTH=="M2"),label=MAX,label.padding = unit(0.55, "lines"),
label.size = 0.35,col="black",fill="#69b3a2")+theme_bw()
NA
NA
NA
Above graph shows that highest sale was on Month M2.
Sales vs Day
It is very clear from the above graph that 1 week of the month is the time of peak sale.
In order to build a model to infer sales value we have to build a model for which data is needed to train the model and also to check the performance of the model we need a test data to measure the accuracy of predoiction of the model, to meet that purpose we split the data using catools library.
library(caTools)
set.seed(11)
splitz=sample.split(store_data$VALUE,SplitRatio = 0.75)
tr_store=subset(store_data,splitz=="TRUE")
ts_store=subset(store_data,splitz=="FALSE")
#Data preparing for xgboost algorithm
library(dplyr)
tr_store1=select(tr_store,-c("VALUE"))
ts_store1=select(ts_store,-c("VALUE"))
label_tr=tr_store$VALUE
lable_ts=ts_store$VALUE
#Converting Data into matrix
tr_store1=data.matrix(tr_store1)
ts_store1=data.matrix(ts_store1)
#Loading xgboost library
library(xgboost)
Attaching package: 㤼㸱xgboost㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
slice
#Converting train and test dat into xgb.dmatrix
dtrain=xgb.DMatrix(data = tr_store1,label=label_tr)
dtest=xgb.DMatrix(data = ts_store1,label=lable_ts)
#Setting parameters for model
params = list(booster = "gbtree", objective = "reg:linear", eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
#calculating best value of nrounds using xgb.cv
xgbcv = xgb.cv( params = params, data = dtrain, nrounds =100, nfold = 5, showsd = T, stratified = T, print_every_n = 10, early_stop_round = 20, maximize = F)
[1] train-rmse:102.859344+1.644619 test-rmse:104.301829+8.793971
[11] train-rmse:8.178496+0.352855 test-rmse:22.362740+6.380684
[21] train-rmse:3.609056+0.331405 test-rmse:20.664499+5.838938
[31] train-rmse:2.813690+0.259298 test-rmse:20.449806+5.833832
[41] train-rmse:2.280538+0.237111 test-rmse:20.394627+5.847138
[51] train-rmse:1.903526+0.190269 test-rmse:20.355018+5.850336
[61] train-rmse:1.603461+0.174214 test-rmse:20.362537+5.829320
[71] train-rmse:1.366090+0.129508 test-rmse:20.353934+5.815968
[81] train-rmse:1.198732+0.116086 test-rmse:20.353267+5.841785
[91] train-rmse:1.021717+0.103120 test-rmse:20.350785+5.854212
[100] train-rmse:0.912785+0.082845 test-rmse:20.349273+5.855793
As we can see that the min rmse is at the 100 value so we will take nrounds as 100 for model building.
#model building
xgb1 = xgb.train (params = params, data = dtrain, nrounds = 100, watchlist = list(val=dtest,train=dtrain), print_every_n = 10, early_stop_round = 10, maximize = F , eval_metric = "rmse")
[1] val-rmse:82.972633 train-rmse:102.535957
[11] val-rmse:13.527576 train-rmse:8.792507
[21] val-rmse:12.989974 train-rmse:3.831362
[31] val-rmse:12.997304 train-rmse:3.191860
[41] val-rmse:12.929111 train-rmse:2.572822
[51] val-rmse:12.893428 train-rmse:2.121477
[61] val-rmse:12.828229 train-rmse:1.765125
[71] val-rmse:12.802021 train-rmse:1.520930
[81] val-rmse:12.785632 train-rmse:1.344225
[91] val-rmse:12.792033 train-rmse:1.140620
[100] val-rmse:12.771682 train-rmse:1.039576
Now we can predict the VALUE for test data and evaluate the accuracy of model.
#predicting test data
predict_store=predict(xgb1,dtest)
#Evaluating accuracy by rmse
RMSE(lable_ts,predict_store)
[1] "rmse of predicted and orignal data is = 12.7716943226327"
Now we can also see the importance of the variable in the terms of information gain.
from the above plot we can see that price qty bill_amt gives most of the information to predict the sales value.
Now as per our project now we have to predict the value for the provided validation data.Here we observe that validation data consist of only 4 variables,thus we need a new model to predict the sales value using these variables.
:
#xgbcv for dewtermining optimum value for nrounds
xgbcv1=xgb.cv(params = params1,data = dtrain1,nrounds = 100,nfold = 5,showsd = T,stratified = T,print_every_n = 10,early_stopping_rounds = 100,maximize = T)
[1] train-rmse:116.840094+2.721652 test-rmse:117.426261+13.024942
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 100 rounds.
[11] train-rmse:96.665439+3.043220 test-rmse:103.885532+12.234097
[21] train-rmse:95.211403+2.999960 test-rmse:104.309747+12.538476
[31] train-rmse:94.514597+3.085344 test-rmse:104.515683+12.105920
[41] train-rmse:94.101265+3.187785 test-rmse:104.815491+12.038717
[51] train-rmse:93.866852+3.177768 test-rmse:105.225247+11.935744
[61] train-rmse:93.700253+3.173371 test-rmse:105.485733+11.808685
[71] train-rmse:93.585498+3.181186 test-rmse:105.668302+11.720508
[81] train-rmse:93.484912+3.185968 test-rmse:105.851183+11.687715
[91] train-rmse:93.419533+3.192469 test-rmse:105.981120+11.658599
[100] train-rmse:93.370892+3.192582 test-rmse:106.093030+11.624056
We observe that at nrounds = 11 the rmse for test and train are minimum so we will use that value for model building.
xgb1 = xgb.train (params = params1, data = dtrain1, nrounds = 11, watchlist = list(val=dtest1,train=dtrain1), print_every_n = 10, early_stop_round = 10, maximize = F , eval_metric = "rmse")
[1] val-rmse:92.612694 train-rmse:117.165169
[11] val-rmse:84.392136 train-rmse:97.752678
Now we have to determine the important variable for model building and predicting the sales value.
imp1=xgb.importance(feature_names = colnames(dtrain1),model = xgb11)
xgb.plot.importance(importance_matrix = imp1,col2rgb(col = "skyblue",alpha = 0.7))
Here we can observe grp is the more significant than the other two variables for predicting sales value.
Before predicting the sales value for validation data we will check the performance of the model by predicting test data and evaluating through rmse.
#Making prediction
xgb_predict1=predict(xgb1,dtest1)
#Checking accuracy
RMSE(lable_ts1,xgb_predict1)
[1] "rmse of predicted and orignal data is = 84.392133260772"
Now we also have to prepare validation data before making predictions.
head(xgb_predict11)
[1] 64.42259 31.22036 80.81692 25.64476 25.64476 25.64476
xgb_predict11=data.frame(xgb_predict11)
colnames(xgb_predict11)="VALUE"
df=data.frame(ID=store_valid$ID,VALUE=xgb_predict11)
df
write.csv(df,"C:/Users/ajesh/Desktop/prediction11.csv",row.names = FALSE)