Loading the files into R.

train<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/train.csv")
test<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/test.csv")

Loading the libraires.

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(rpart)
library(rpart.plot)
library(caret)

## Loading required package: lattice

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

Looking at the files.

dim(test)

## [1] 32573     8

dim(train)

## [1] 456548      9

names(test)

## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"

names(train)

## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"     "num_orders"

Train

test$num_orders<-NA
names(test)

## [1] "id"                    "week"                  "center_id"            
## [4] "meal_id"               "checkout_price"        "base_price"           
## [7] "emailer_for_promotion" "homepage_featured"     "num_orders"

Removing the outlier form train that we have found later.

max(train$num_orders)

## [1] 24299

train<-filter(train,train$num_orders!=24299)
max(train$num_orders)

## [1] 15336

Making a column set, for identifying the original dataset.

test$set<-"Test"
train$set<-"Train"

Combining the data of train and test for a big dataset Full.

full<-rbind(train,test)
dim(full)

## [1] 489120     10

Converting full into tbl_df

full<-tbl_df(full)

Looking at full

names(full)

##  [1] "id"                    "week"                 
##  [3] "center_id"             "meal_id"              
##  [5] "checkout_price"        "base_price"           
##  [7] "emailer_for_promotion" "homepage_featured"    
##  [9] "num_orders"            "set"

class(full$num_orders)

## [1] "integer"

Checking for missing values.

missing_values<-summarise_all(full,funs(sum(is.na(.))/n()))
missing_values<-gather(missing_values,key="feature",value="MissingPoints")

Plotting the missing values.

g<-ggplot(data=missing_values,aes(x=reorder(feature,-MissingPoints),y=MissingPoints))
g<-g+geom_bar(stat="identity")+coord_flip()
g

Converting Emailer for promotion as a factor as it is only of two values.

full$emailer_for_promotion<-factor(full$emailer_for_promotion,levels = c(0,1))

Homepage features also

full$homepage_featured<-factor(full$homepage_featured,levels = c(0,1))

Only number of orders has missing values,6%. because of the test values.

names(full)

##  [1] "id"                    "week"                 
##  [3] "center_id"             "meal_id"              
##  [5] "checkout_price"        "base_price"           
##  [7] "emailer_for_promotion" "homepage_featured"    
##  [9] "num_orders"            "set"

Visualizing

Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(y=full$num_orders,x=full$id))
g

## Warning: Removed 32573 rows containing missing values (geom_point).

##Emailer for promotion and number of orders.

g<-ggplot(data=full)
g<-g+geom_bar(stat="identity",aes(x=full$emailer_for_promotion,y=full$num_orders,fill=full$emailer_for_promotion))
g<-g+xlab("Emailer For promotion")+ylab("Number of orders")+theme(legend.position = "none")
g

## Warning: Removed 32573 rows containing missing values (position_stack).

Homepage featured

g<-ggplot(data=full)
g<-g+geom_bar(stat="identity",aes(x=full$homepage_featured,y=full$num_orders))
g

## Warning: Removed 32573 rows containing missing values (position_stack).

Week and Number of orders.

g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$week,y=train$num_orders,fill=train$week))
g<-g+xlab("Week")+ylab("Number of orders")+theme(legend.position="none")
g

There is no solid relation between Week and Number of orders.

Between Base price and Numbe or orders.

g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$base_price,y=train$num_orders))
g

Between Checkout price and Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$checkout_price,y=full$num_orders))
g

## Warning: Removed 32573 rows containing missing values (geom_point).

Between Meal_id and Number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$meal_id,y=full$num_orders))
g

## Warning: Removed 32573 rows containing missing values (geom_point).

Between Center id and number of orders.

g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$center_id,y=full$num_orders))
g

## Warning: Removed 32573 rows containing missing values (geom_point).

No relation

Cross validation

c<-sample.int(n=nrow(train),size=0.6*nrow(train),replace = F)
train_1<-train[c,]
test_1<-train[-c,]

Using Decision Trees.

dt<-rpart(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1)
rpart.plot(dt,fallen.leaves = F)

Using Random Forest

rf<-randomForest(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1,ntree=50)

Looking at the model.

summary(dt)

## Call:
## rpart(formula = num_orders ~ meal_id + checkout_price + base_price + 
##     emailer_for_promotion + homepage_featured, data = train_1)
##   n= 273928 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.08738626      0 1.0000000 1.0000039 0.01863718
## 2 0.05941021      1 0.9126137 0.9126451 0.01733136
## 3 0.04662786      2 0.8532035 0.8532955 0.01637541
## 4 0.03534945      3 0.8065757 0.8066687 0.01624136
## 5 0.02132553      4 0.7712262 0.7713713 0.01525086
## 6 0.01216105      6 0.7285752 0.7289274 0.01333873
## 7 0.01126135      7 0.7164141 0.7167985 0.01291864
## 8 0.01079270      9 0.6938914 0.6970486 0.01275121
## 9 0.01000000     14 0.6399279 0.6410112 0.01265038
## 
## Variable importance
##        checkout_price            base_price               meal_id 
##                    25                    24                    23 
##     homepage_featured emailer_for_promotion 
##                    17                    11 
## 
## Node number 1: 273928 observations,    complexity param=0.08738626
##   mean=261.6252, MSE=155258.3 
##   left son=2 (244050 obs) right son=3 (29878 obs)
##   Primary splits:
##       homepage_featured     < 0.5     to the left,  improve=0.08738626, (0 missing)
##       checkout_price        < 279.875 to the right, improve=0.07773662, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.07675609, (0 missing)
##       base_price            < 335.165 to the right, improve=0.06538558, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.01831642, (0 missing)
##   Surrogate splits:
##       emailer_for_promotion < 0.5     to the left,  agree=0.894, adj=0.026, (0 split)
## 
## Node number 2: 244050 observations,    complexity param=0.04662786
##   mean=220.8698, MSE=84590.39 
##   left son=4 (90149 obs) right son=5 (153901 obs)
##   Primary splits:
##       checkout_price        < 335.165 to the right, improve=0.09605872, (0 missing)
##       base_price            < 335.165 to the right, improve=0.09471307, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.03402886, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.02413694, (0 missing)
##   Surrogate splits:
##       base_price < 335.165 to the right, agree=0.979, adj=0.942, (0 split)
##       meal_id    < 2846.5  to the right, agree=0.656, adj=0.068, (0 split)
## 
## Node number 3: 29878 observations,    complexity param=0.05941021
##   mean=594.5243, MSE=608099.6 
##   left son=6 (14980 obs) right son=7 (14898 obs)
##   Primary splits:
##       checkout_price        < 274.055 to the right, improve=0.13906760, (0 missing)
##       base_price            < 336.15  to the right, improve=0.08808724, (0 missing)
##       emailer_for_promotion < 0.5     to the left,  improve=0.04917462, (0 missing)
##       meal_id               < 2297    to the right, improve=0.03159082, (0 missing)
##   Surrogate splits:
##       base_price            < 350.685 to the right, agree=0.899, adj=0.797, (0 split)
##       meal_id               < 1649    to the left,  agree=0.595, adj=0.187, (0 split)
##       emailer_for_promotion < 0.5     to the right, agree=0.554, adj=0.105, (0 split)
## 
## Node number 4: 90149 observations
##   mean=103.0904, MSE=16902.86 
## 
## Node number 5: 153901 observations,    complexity param=0.01216105
##   mean=289.8602, MSE=111353.7 
##   left son=10 (147774 obs) right son=11 (6127 obs)
##   Primary splits:
##       emailer_for_promotion < 0.5     to the left,  improve=0.03017978, (0 missing)
##       meal_id               < 1153.5  to the right, improve=0.02080238, (0 missing)
##       checkout_price        < 241.075 to the right, improve=0.01565182, (0 missing)
##       base_price            < 151.835 to the right, improve=0.01102966, (0 missing)
## 
## Node number 6: 14980 observations
##   mean=304.5173, MSE=106740.3 
## 
## Node number 7: 14898 observations,    complexity param=0.03534945
##   mean=886.1275, MSE=942619.1 
##   left son=14 (9982 obs) right son=15 (4916 obs)
##   Primary splits:
##       emailer_for_promotion < 0.5     to the left,  improve=0.10705570, (0 missing)
##       meal_id               < 2297    to the right, improve=0.06996197, (0 missing)
##       base_price            < 292.485 to the left,  improve=0.05129185, (0 missing)
##       checkout_price        < 119.825 to the left,  improve=0.02493541, (0 missing)
##   Surrogate splits:
##       base_price     < 292.485 to the left,  agree=0.735, adj=0.197, (0 split)
##       meal_id        < 1153.5  to the right, agree=0.690, adj=0.061, (0 split)
##       checkout_price < 265.325 to the left,  agree=0.677, adj=0.021, (0 split)
## 
## Node number 10: 147774 observations,    complexity param=0.0107927
##   mean=278.056, MSE=82629.92 
##   left son=20 (135701 obs) right son=21 (12073 obs)
##   Primary splits:
##       meal_id        < 1153.5  to the right, improve=0.02932473, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.01879387, (0 missing)
##       base_price     < 151.835 to the right, improve=0.01794499, (0 missing)
## 
## Node number 11: 6127 observations
##   mean=574.5588, MSE=719713.8 
## 
## Node number 14: 9982 observations,    complexity param=0.01126135
##   mean=663.1967, MSE=447604.9 
##   left son=28 (3856 obs) right son=29 (6126 obs)
##   Primary splits:
##       meal_id        < 2298    to the right, improve=0.09244895, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.07078568, (0 missing)
##       base_price     < 151.835 to the right, improve=0.06617691, (0 missing)
##   Surrogate splits:
##       base_price     < 216.795 to the right, agree=0.656, adj=0.110, (0 split)
##       checkout_price < 192.575 to the right, agree=0.645, adj=0.081, (0 split)
## 
## Node number 15: 4916 observations,    complexity param=0.02132553
##   mean=1338.791, MSE=1641934 
##   left son=30 (1868 obs) right son=31 (3048 obs)
##   Primary splits:
##       meal_id        < 2297    to the right, improve=0.07878209, (0 missing)
##       base_price     < 292.485 to the left,  improve=0.03615819, (0 missing)
##       checkout_price < 119.795 to the left,  improve=0.03465867, (0 missing)
##   Surrogate splits:
##       base_price < 256.595 to the left,  agree=0.648, adj=0.073, (0 split)
## 
## Node number 20: 135701 observations,    complexity param=0.0107927
##   mean=263.3735, MSE=80019.61 
##   left son=40 (23537 obs) right son=41 (112164 obs)
##   Primary splits:
##       meal_id        < 1635    to the left,  improve=0.02802507, (0 missing)
##       base_price     < 151.835 to the right, improve=0.02638343, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.01853121, (0 missing)
##   Surrogate splits:
##       base_price     < 457.4   to the right, agree=0.827, adj=0.003, (0 split)
##       checkout_price < 80.555  to the left,  agree=0.827, adj=0.001, (0 split)
## 
## Node number 21: 12073 observations
##   mean=443.0885, MSE=82311.11 
## 
## Node number 28: 3856 observations
##   mean=406.7964, MSE=167184.7 
## 
## Node number 29: 6126 observations,    complexity param=0.01126135
##   mean=824.5873, MSE=556687.4 
##   left son=58 (1641 obs) right son=59 (4485 obs)
##   Primary splits:
##       meal_id        < 1635    to the left,  improve=0.15975880, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.05329010, (0 missing)
##       base_price     < 151.835 to the right, improve=0.04718472, (0 missing)
##   Surrogate splits:
##       checkout_price < 241.075 to the right, agree=0.801, adj=0.255, (0 split)
##       base_price     < 185.315 to the right, agree=0.781, adj=0.181, (0 split)
## 
## Node number 30: 1868 observations
##   mean=879.3704, MSE=593239.5 
## 
## Node number 31: 3048 observations,    complexity param=0.02132553
##   mean=1620.352, MSE=2076007 
##   left son=62 (2732 obs) right son=63 (316 obs)
##   Primary splits:
##       meal_id        < 2141.5  to the left,  improve=0.18617010, (0 missing)
##       base_price     < 292.485 to the left,  improve=0.05194058, (0 missing)
##       checkout_price < 119.795 to the left,  improve=0.04417040, (0 missing)
## 
## Node number 40: 23537 observations
##   mean=159.9967, MSE=26910.8 
## 
## Node number 41: 112164 observations,    complexity param=0.0107927
##   mean=285.0665, MSE=88451.05 
##   left son=82 (64359 obs) right son=83 (47805 obs)
##   Primary splits:
##       meal_id        < 2297    to the right, improve=0.03282895, (0 missing)
##       base_price     < 151.835 to the right, improve=0.02887008, (0 missing)
##       checkout_price < 193.545 to the right, improve=0.02580156, (0 missing)
##   Surrogate splits:
##       base_price     < 214.885 to the right, agree=0.633, adj=0.139, (0 split)
##       checkout_price < 185.785 to the right, agree=0.631, adj=0.135, (0 split)
## 
## Node number 58: 1641 observations
##   mean=331.5667, MSE=117781 
## 
## Node number 59: 4485 observations
##   mean=1004.977, MSE=595801.1 
## 
## Node number 62: 2732 observations
##   mean=1408.919, MSE=1334063 
## 
## Node number 63: 316 observations
##   mean=3448.31, MSE=4762616 
## 
## Node number 82: 64359 observations
##   mean=238.6244, MSE=58761.79 
## 
## Node number 83: 47805 observations,    complexity param=0.0107927
##   mean=347.5907, MSE=121608.1 
##   left son=166 (41889 obs) right son=167 (5916 obs)
##   Primary splits:
##       meal_id        < 2214.5  to the left,  improve=0.10749530, (0 missing)
##       base_price     < 153.275 to the right, improve=0.07608897, (0 missing)
##       checkout_price < 153.275 to the right, improve=0.07152240, (0 missing)
## 
## Node number 166: 41889 observations,    complexity param=0.0107927
##   mean=304.6232, MSE=104994.7 
##   left son=332 (32541 obs) right son=333 (9348 obs)
##   Primary splits:
##       base_price     < 153.275 to the right, improve=0.15507530, (0 missing)
##       checkout_price < 153.275 to the right, improve=0.15081430, (0 missing)
##       meal_id        < 2066    to the right, improve=0.07975996, (0 missing)
##   Surrogate splits:
##       checkout_price < 152.805 to the right, agree=0.98, adj=0.909, (0 split)
## 
## Node number 167: 5916 observations
##   mean=651.8276, MSE=133609.6 
## 
## Node number 332: 32541 observations
##   mean=236.2322, MSE=54585.77 
## 
## Node number 333: 9348 observations
##   mean=542.6966, MSE=207510.3

Predicting for train1 using DT.

train_1$pred<-rpart.predict(dt,data=train_1,type="vector")
head(select(train_1,c(pred,num_orders)))

##            pred num_orders
## 145464 238.6244        136
## 97364  103.0904        175
## 47312  574.5588        190
## 91432  103.0904         28
## 335394 238.6244        486
## 275856 304.5173         81

Predicting on Train1 using RF.

train_1$predRF<-predict(rf,data=train_1)
head(select(train_1,c(predRF,num_orders)))

##          predRF num_orders
## 145464 355.8383        136
## 97364  122.1820        175
## 47312  342.4670        190
## 91432  121.7840         28
## 335394 259.3124        486
## 275856 278.1963         81

Predeicting for test_1 Using DT.

test_1$pred<-predict(dt,newdata = test_1)

Predicting for test_1 using RF.

test_1$predRF<-predict(rf,newdata = test_1)

Counting Residuals DT.

e<-test_1$num_orders-test_1$pred
length(e)

## [1] 182619

e<-data.frame(row=1:182619,resid=e)
summary(e$resid)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -3069.310  -129.232   -56.697    -0.843    58.910 13224.023

Counting Residuals RF.

rfE<-test_1$num_orders-test_1$predRF
rfE<-data.frame(row=1:182619,resid=rfE)
summary(rfE)

##       row             resid          
##  Min.   :     1   Min.   :-1185.807  
##  1st Qu.: 45656   1st Qu.: -138.481  
##  Median : 91310   Median :  -69.047  
##  Mean   : 91310   Mean   :   -0.421  
##  3rd Qu.:136965   3rd Qu.:   44.909  
##  Max.   :182619   Max.   :13235.620

Comparing both residual.

par(mfrow=c(1,2))

Plotting residuals DT.

boxplot(e$resid)

Plotting Residuals Rf.

boxplot(rfE$resid)

Predicting on test data using DT.

submission<-predict(dt,newdata = test)
sub_df<-data.frame(id=test$id,num_orders=submission)
head(sub_df)

##        id num_orders
## 1 1028232   236.2322
## 2 1127204   236.2322
## 3 1212707   238.6244
## 4 1082698   238.6244
## 5 1400926   159.9967
## 6 1284113   236.2322

Predicting on test using RF.

sub<-predict(rf,newdata = test)
sub_df_1<-data.frame(id=test$id,num_orders=sub)

Making a csv file.

write.csv(sub_df,"submission2.csv",row.names = F)
write.csv(sub_df_1,"submission3.csv",row.names = F)

Food Demand Forecasting Challenge

Shashwat Khare

March 19, 2019

Loading the files into R.

Loading the libraires.

Looking at the files.

Train

Removing the outlier form train that we have found later.

Making a column set, for identifying the original dataset.

Combining the data of train and test for a big dataset Full.

Converting full into tbl_df

Looking at full

Checking for missing values.

Plotting the missing values.

Converting Emailer for promotion as a factor as it is only of two values.

Homepage features also

Only number of orders has missing values,6%. because of the test values.

Visualizing

Number of orders.

Homepage featured

Week and Number of orders.

There is no solid relation between Week and Number of orders.

Between Base price and Numbe or orders.

Between Checkout price and Number of orders.

Between Meal_id and Number of orders.

Between Center id and number of orders.

Cross validation

Using Decision Trees.

Using Random Forest

Looking at the model.

Predicting for train1 using DT.

Predicting on Train1 using RF.

Predeicting for test_1 Using DT.

Predicting for test_1 using RF.

Counting Residuals DT.

Counting Residuals RF.

Comparing both residual.

Plotting residuals DT.

Plotting Residuals Rf.

Predicting on test data using DT.

Predicting on test using RF.

Making a csv file.