Loading the files into R.

train<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/train.csv")
test<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/test.csv")

Loading the libraires.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin

Looking at the files.

dim(test)
## [1] 32573     8
dim(train)
## [1] 456548      9
names(test)
## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"
names(train)
## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"     "num_orders"

Train

test$num_orders<-NA
names(test)
## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"     "num_orders"

Removing the outlier form train that we have found later.

max(train$num_orders)
## [1] 24299
train<-filter(train,train$num_orders!=24299)
max(train$num_orders)
## [1] 15336

Making a column set, for identifying the original dataset.

test$set<-"Test"
train$set<-"Train"

Combining the data of train and test for a big dataset Full.

full<-rbind(train,test)
dim(full)
## [1] 489120     10

Converting full into tbl_df

full<-tbl_df(full)

Looking at full

names(full)
##  [1] "id"                    "week"                 
##  [3] "center_id"             "meal_id"              
##  [5] "checkout_price"        "base_price"           
##  [7] "emailer_for_promotion" "homepage_featured"    
##  [9] "num_orders"            "set"
class(full$num_orders)
## [1] "integer"

Checking for missing values.

missing_values<-summarise_all(full,funs(sum(is.na(.))/n()))
missing_values<-gather(missing_values,key="feature",value="MissingPoints")

Plotting the missing values.

g<-ggplot(data=missing_values,aes(x=reorder(feature,-MissingPoints),y=MissingPoints))
g<-g+geom_bar(stat="identity")+coord_flip()
g

Converting Emailer for promotion as a factor as it is only of two values.

full$emailer_for_promotion<-factor(full$emailer_for_promotion,levels = c(0,1))

Homepage features also

full$homepage_featured<-factor(full$homepage_featured,levels = c(0,1))

Only number of orders has missing values,6%. because of the test values.

names(full)
##  [1] "id"                    "week"                 
##  [3] "center_id"             "meal_id"              
##  [5] "checkout_price"        "base_price"           
##  [7] "emailer_for_promotion" "homepage_featured"    
##  [9] "num_orders"            "set"

Visualizing

Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(y=full$num_orders,x=full$id))
g
## Warning: Removed 32573 rows containing missing values (geom_point).

##Emailer for promotion and number of orders.

g<-ggplot(data=full)
g<-g+geom_bar(stat="identity",aes(x=full$emailer_for_promotion,y=full$num_orders,fill=full$emailer_for_promotion))
g<-g+xlab("Emailer For promotion")+ylab("Number of orders")+theme(legend.position = "none")
g
## Warning: Removed 32573 rows containing missing values (position_stack).

Week and Number of orders.

g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$week,y=train$num_orders,fill=train$week))
g<-g+xlab("Week")+ylab("Number of orders")+theme(legend.position="none")
g

There is no solid relation between Week and Number of orders.

Between Base price and Numbe or orders.

g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$base_price,y=train$num_orders))
g

Between Checkout price and Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$checkout_price,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).

Between Meal_id and Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$meal_id,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).

Between Center id and number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$center_id,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).

No relation

Cross validation

c<-sample.int(n=nrow(train),size=0.6*nrow(train),replace = F)
train_1<-train[c,]
test_1<-train[-c,]

Using Decision Trees.

dt<-rpart(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1)
rpart.plot(dt,fallen.leaves = F)

Using Random Forest

rf<-randomForest(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1,ntree=50)

Looking at the model.

summary(dt)
## Call:
## rpart(formula = num_orders ~ meal_id + checkout_price + base_price + 
##     emailer_for_promotion + homepage_featured, data = train_1)
##   n= 273928 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.08738626      0 1.0000000 1.0000039 0.01863718
## 2 0.05941021      1 0.9126137 0.9126451 0.01733136
## 3 0.04662786      2 0.8532035 0.8532955 0.01637541
## 4 0.03534945      3 0.8065757 0.8066687 0.01624136
## 5 0.02132553      4 0.7712262 0.7713713 0.01525086
## 6 0.01216105      6 0.7285752 0.7289274 0.01333873
## 7 0.01126135      7 0.7164141 0.7167985 0.01291864
## 8 0.01079270      9 0.6938914 0.6970486 0.01275121
## 9 0.01000000     14 0.6399279 0.6410112 0.01265038
## 
## Variable importance
##        checkout_price            base_price               meal_id 
##                    25                    24                    23 
##     homepage_featured emailer_for_promotion 
##                    17                    11 
## 
## Node number 1: 273928 observations,    complexity param=0.08738626
##   mean=261.6252, MSE=155258.3 
##   left son=2 (244050 obs) right son=3 (29878 obs)
##   Primary splits:
##       homepage_featured     < 0.5     to the left,  improve=0.08738626, (0 missing)
##       checkout_price        < 279.875 to the right, improve=0.07773662, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.07675609, (0 missing)
##       base_price            < 335.165 to the right, improve=0.06538558, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.01831642, (0 missing)
##   Surrogate splits:
##       emailer_for_promotion < 0.5     to the left,  agree=0.894, adj=0.026, (0 split)
## 
## Node number 2: 244050 observations,    complexity param=0.04662786
##   mean=220.8698, MSE=84590.39 
##   left son=4 (90149 obs) right son=5 (153901 obs)
##   Primary splits:
##       checkout_price        < 335.165 to the right, improve=0.09605872, (0 missing)
##       base_price            < 335.165 to the right, improve=0.09471307, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.03402886, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.02413694, (0 missing)
##   Surrogate splits:
##       base_price < 335.165 to the right, agree=0.979, adj=0.942, (0 split)
##       meal_id    < 2846.5  to the right, agree=0.656, adj=0.068, (0 split)
## 
## Node number 3: 29878 observations,    complexity param=0.05941021
##   mean=594.5243, MSE=608099.6 
##   left son=6 (14980 obs) right son=7 (14898 obs)
##   Primary splits:
##       checkout_price        < 274.055 to the right, improve=0.13906760, (0 missing)
##       base_price            < 336.15  to the right, improve=0.08808724, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.04917462, (0 missing)
##       meal_id               < 2297    to the right, improve=0.03159082, (0 missing)
##   Surrogate splits:
##       base_price            < 350.685 to the right, agree=0.899, adj=0.797, (0 split)
##       meal_id               < 1649    to the left,  agree=0.595, adj=0.187, (0 split)
##       emailer_for_promotion < 0.5     to the right, agree=0.554, adj=0.105, (0 split)
## 
## Node number 4: 90149 observations
##   mean=103.0904, MSE=16902.86 
## 
## Node number 5: 153901 observations,    complexity param=0.01216105
##   mean=289.8602, MSE=111353.7 
##   left son=10 (147774 obs) right son=11 (6127 obs)
##   Primary splits:
##       emailer_for_promotion < 0.5     to the left,  improve=0.03017978, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.02080238, (0 missing)
##       checkout_price        < 241.075 to the right, improve=0.01565182, (0 missing)
##       base_price            < 151.835 to the right, improve=0.01102966, (0 missing)
## 
## Node number 6: 14980 observations
##   mean=304.5173, MSE=106740.3 
## 
## Node number 7: 14898 observations,    complexity param=0.03534945
##   mean=886.1275, MSE=942619.1 
##   left son=14 (9982 obs) right son=15 (4916 obs)
##   Primary splits:
##       emailer_for_promotion < 0.5     to the left,  improve=0.10705570, (0 missing)
##       meal_id               < 2297    to the right, improve=0.06996197, (0 missing)
##       base_price            < 292.485 to the left,  improve=0.05129185, (0 missing)
##       checkout_price        < 119.825 to the left,  improve=0.02493541, (0 missing)
##   Surrogate splits:
##       base_price     < 292.485 to the left,  agree=0.735, adj=0.197, (0 split)
##       meal_id        < 1153.5  to the right, agree=0.690, adj=0.061, (0 split)
##       checkout_price < 265.325 to the left,  agree=0.677, adj=0.021, (0 split)
## 
## Node number 10: 147774 observations,    complexity param=0.0107927
##   mean=278.056, MSE=82629.92 
##   left son=20 (135701 obs) right son=21 (12073 obs)
##   Primary splits:
##       meal_id        < 1153.5  to the right, improve=0.02932473, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.01879387, (0 missing)
##       base_price     < 151.835 to the right, improve=0.01794499, (0 missing)
## 
## Node number 11: 6127 observations
##   mean=574.5588, MSE=719713.8 
## 
## Node number 14: 9982 observations,    complexity param=0.01126135
##   mean=663.1967, MSE=447604.9 
##   left son=28 (3856 obs) right son=29 (6126 obs)
##   Primary splits:
##       meal_id        < 2298    to the right, improve=0.09244895, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.07078568, (0 missing)
##       base_price     < 151.835 to the right, improve=0.06617691, (0 missing)
##   Surrogate splits:
##       base_price     < 216.795 to the right, agree=0.656, adj=0.110, (0 split)
##       checkout_price < 192.575 to the right, agree=0.645, adj=0.081, (0 split)
## 
## Node number 15: 4916 observations,    complexity param=0.02132553
##   mean=1338.791, MSE=1641934 
##   left son=30 (1868 obs) right son=31 (3048 obs)
##   Primary splits:
##       meal_id        < 2297    to the right, improve=0.07878209, (0 missing)
##       base_price     < 292.485 to the left,  improve=0.03615819, (0 missing)
##       checkout_price < 119.795 to the left,  improve=0.03465867, (0 missing)
##   Surrogate splits:
##       base_price < 256.595 to the left,  agree=0.648, adj=0.073, (0 split)
## 
## Node number 20: 135701 observations,    complexity param=0.0107927
##   mean=263.3735, MSE=80019.61 
##   left son=40 (23537 obs) right son=41 (112164 obs)
##   Primary splits:
##       meal_id        < 1635    to the left,  improve=0.02802507, (0 missing)
##       base_price     < 151.835 to the right, improve=0.02638343, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.01853121, (0 missing)
##   Surrogate splits:
##       base_price     < 457.4   to the right, agree=0.827, adj=0.003, (0 split)
##       checkout_price < 80.555  to the left,  agree=0.827, adj=0.001, (0 split)
## 
## Node number 21: 12073 observations
##   mean=443.0885, MSE=82311.11 
## 
## Node number 28: 3856 observations
##   mean=406.7964, MSE=167184.7 
## 
## Node number 29: 6126 observations,    complexity param=0.01126135
##   mean=824.5873, MSE=556687.4 
##   left son=58 (1641 obs) right son=59 (4485 obs)
##   Primary splits:
##       meal_id        < 1635    to the left,  improve=0.15975880, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.05329010, (0 missing)
##       base_price     < 151.835 to the right, improve=0.04718472, (0 missing)
##   Surrogate splits:
##       checkout_price < 241.075 to the right, agree=0.801, adj=0.255, (0 split)
##       base_price     < 185.315 to the right, agree=0.781, adj=0.181, (0 split)
## 
## Node number 30: 1868 observations
##   mean=879.3704, MSE=593239.5 
## 
## Node number 31: 3048 observations,    complexity param=0.02132553
##   mean=1620.352, MSE=2076007 
##   left son=62 (2732 obs) right son=63 (316 obs)
##   Primary splits:
##       meal_id        < 2141.5  to the left,  improve=0.18617010, (0 missing)
##       base_price     < 292.485 to the left,  improve=0.05194058, (0 missing)
##       checkout_price < 119.795 to the left,  improve=0.04417040, (0 missing)
## 
## Node number 40: 23537 observations
##   mean=159.9967, MSE=26910.8 
## 
## Node number 41: 112164 observations,    complexity param=0.0107927
##   mean=285.0665, MSE=88451.05 
##   left son=82 (64359 obs) right son=83 (47805 obs)
##   Primary splits:
##       meal_id        < 2297    to the right, improve=0.03282895, (0 missing)
##       base_price     < 151.835 to the right, improve=0.02887008, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.02580156, (0 missing)
##   Surrogate splits:
##       base_price     < 214.885 to the right, agree=0.633, adj=0.139, (0 split)
##       checkout_price < 185.785 to the right, agree=0.631, adj=0.135, (0 split)
## 
## Node number 58: 1641 observations
##   mean=331.5667, MSE=117781 
## 
## Node number 59: 4485 observations
##   mean=1004.977, MSE=595801.1 
## 
## Node number 62: 2732 observations
##   mean=1408.919, MSE=1334063 
## 
## Node number 63: 316 observations
##   mean=3448.31, MSE=4762616 
## 
## Node number 82: 64359 observations
##   mean=238.6244, MSE=58761.79 
## 
## Node number 83: 47805 observations,    complexity param=0.0107927
##   mean=347.5907, MSE=121608.1 
##   left son=166 (41889 obs) right son=167 (5916 obs)
##   Primary splits:
##       meal_id        < 2214.5  to the left,  improve=0.10749530, (0 missing)
##       base_price     < 153.275 to the right, improve=0.07608897, (0 missing)
##       checkout_price < 153.275 to the right, improve=0.07152240, (0 missing)
## 
## Node number 166: 41889 observations,    complexity param=0.0107927
##   mean=304.6232, MSE=104994.7 
##   left son=332 (32541 obs) right son=333 (9348 obs)
##   Primary splits:
##       base_price     < 153.275 to the right, improve=0.15507530, (0 missing)
##       checkout_price < 153.275 to the right, improve=0.15081430, (0 missing)
##       meal_id        < 2066    to the right, improve=0.07975996, (0 missing)
##   Surrogate splits:
##       checkout_price < 152.805 to the right, agree=0.98, adj=0.909, (0 split)
## 
## Node number 167: 5916 observations
##   mean=651.8276, MSE=133609.6 
## 
## Node number 332: 32541 observations
##   mean=236.2322, MSE=54585.77 
## 
## Node number 333: 9348 observations
##   mean=542.6966, MSE=207510.3

Predicting for train1 using DT.

train_1$pred<-rpart.predict(dt,data=train_1,type="vector")
head(select(train_1,c(pred,num_orders)))
##            pred num_orders
## 145464 238.6244        136
## 97364  103.0904        175
## 47312  574.5588        190
## 91432  103.0904         28
## 335394 238.6244        486
## 275856 304.5173         81

Predicting on Train1 using RF.

train_1$predRF<-predict(rf,data=train_1)
head(select(train_1,c(predRF,num_orders)))
##          predRF num_orders
## 145464 355.8383        136
## 97364  122.1820        175
## 47312  342.4670        190
## 91432  121.7840         28
## 335394 259.3124        486
## 275856 278.1963         81

Predeicting for test_1 Using DT.

test_1$pred<-predict(dt,newdata = test_1)

Predicting for test_1 using RF.

test_1$predRF<-predict(rf,newdata = test_1)

Counting Residuals DT.

e<-test_1$num_orders-test_1$pred
length(e)
## [1] 182619
e<-data.frame(row=1:182619,resid=e)
summary(e$resid)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -3069.310  -129.232   -56.697    -0.843    58.910 13224.023

Counting Residuals RF.

rfE<-test_1$num_orders-test_1$predRF
rfE<-data.frame(row=1:182619,resid=rfE)
summary(rfE)
##       row             resid          
##  Min.   :     1   Min.   :-1185.807  
##  1st Qu.: 45656   1st Qu.: -138.481  
##  Median : 91310   Median :  -69.047  
##  Mean   : 91310   Mean   :   -0.421  
##  3rd Qu.:136965   3rd Qu.:   44.909  
##  Max.   :182619   Max.   :13235.620

Comparing both residual.

par(mfrow=c(1,2))

Plotting residuals DT.

boxplot(e$resid)

Plotting Residuals Rf.

boxplot(rfE$resid)

Predicting on test data using DT.

submission<-predict(dt,newdata = test)
sub_df<-data.frame(id=test$id,num_orders=submission)
head(sub_df)
##        id num_orders
## 1 1028232   236.2322
## 2 1127204   236.2322
## 3 1212707   238.6244
## 4 1082698   238.6244
## 5 1400926   159.9967
## 6 1284113   236.2322

Predicting on test using RF.

sub<-predict(rf,newdata = test)
sub_df_1<-data.frame(id=test$id,num_orders=sub)

Making a csv file.

write.csv(sub_df,"submission2.csv",row.names = F)
write.csv(sub_df_1,"submission3.csv",row.names = F)