train<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/train.csv")
test<-read.csv("F:/Data science/practice/Finished/Food Demand Forecasting Challage/test.csv")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
dim(test)
## [1] 32573 8
dim(train)
## [1] 456548 9
names(test)
## [1] "id" "week" "center_id"
## [4] "meal_id" "checkout_price" "base_price"
## [7] "emailer_for_promotion" "homepage_featured"
names(train)
## [1] "id" "week" "center_id"
## [4] "meal_id" "checkout_price" "base_price"
## [7] "emailer_for_promotion" "homepage_featured" "num_orders"
test$num_orders<-NA
names(test)
## [1] "id" "week" "center_id"
## [4] "meal_id" "checkout_price" "base_price"
## [7] "emailer_for_promotion" "homepage_featured" "num_orders"
max(train$num_orders)
## [1] 24299
train<-filter(train,train$num_orders!=24299)
max(train$num_orders)
## [1] 15336
test$set<-"Test"
train$set<-"Train"
full<-rbind(train,test)
dim(full)
## [1] 489120 10
full<-tbl_df(full)
names(full)
## [1] "id" "week"
## [3] "center_id" "meal_id"
## [5] "checkout_price" "base_price"
## [7] "emailer_for_promotion" "homepage_featured"
## [9] "num_orders" "set"
class(full$num_orders)
## [1] "integer"
missing_values<-summarise_all(full,funs(sum(is.na(.))/n()))
missing_values<-gather(missing_values,key="feature",value="MissingPoints")
g<-ggplot(data=missing_values,aes(x=reorder(feature,-MissingPoints),y=MissingPoints))
g<-g+geom_bar(stat="identity")+coord_flip()
g
full$emailer_for_promotion<-factor(full$emailer_for_promotion,levels = c(0,1))
full$homepage_featured<-factor(full$homepage_featured,levels = c(0,1))
names(full)
## [1] "id" "week"
## [3] "center_id" "meal_id"
## [5] "checkout_price" "base_price"
## [7] "emailer_for_promotion" "homepage_featured"
## [9] "num_orders" "set"
g<-ggplot(data=full)
g<-g+geom_point(aes(y=full$num_orders,x=full$id))
g
## Warning: Removed 32573 rows containing missing values (geom_point).
##Emailer for promotion and number of orders.
g<-ggplot(data=full)
g<-g+geom_bar(stat="identity",aes(x=full$emailer_for_promotion,y=full$num_orders,fill=full$emailer_for_promotion))
g<-g+xlab("Emailer For promotion")+ylab("Number of orders")+theme(legend.position = "none")
g
## Warning: Removed 32573 rows containing missing values (position_stack).
g<-ggplot(data=full)
g<-g+geom_bar(stat="identity",aes(x=full$homepage_featured,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (position_stack).
g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$week,y=train$num_orders,fill=train$week))
g<-g+xlab("Week")+ylab("Number of orders")+theme(legend.position="none")
g
g<-ggplot(data=train)
g<-g+geom_point(aes(x=train$base_price,y=train$num_orders))
g
g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$checkout_price,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).
g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$meal_id,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).
g<-ggplot(data=full)
g<-g+geom_point(aes(x=full$center_id,y=full$num_orders))
g
## Warning: Removed 32573 rows containing missing values (geom_point).
No relation
c<-sample.int(n=nrow(train),size=0.6*nrow(train),replace = F)
train_1<-train[c,]
test_1<-train[-c,]
dt<-rpart(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1)
rpart.plot(dt,fallen.leaves = F)
rf<-randomForest(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured,data=train_1,ntree=50)
summary(dt)
## Call:
## rpart(formula = num_orders ~ meal_id + checkout_price + base_price +
## emailer_for_promotion + homepage_featured, data = train_1)
## n= 273928
##
## CP nsplit rel error xerror xstd
## 1 0.08738626 0 1.0000000 1.0000039 0.01863718
## 2 0.05941021 1 0.9126137 0.9126451 0.01733136
## 3 0.04662786 2 0.8532035 0.8532955 0.01637541
## 4 0.03534945 3 0.8065757 0.8066687 0.01624136
## 5 0.02132553 4 0.7712262 0.7713713 0.01525086
## 6 0.01216105 6 0.7285752 0.7289274 0.01333873
## 7 0.01126135 7 0.7164141 0.7167985 0.01291864
## 8 0.01079270 9 0.6938914 0.6970486 0.01275121
## 9 0.01000000 14 0.6399279 0.6410112 0.01265038
##
## Variable importance
## checkout_price base_price meal_id
## 25 24 23
## homepage_featured emailer_for_promotion
## 17 11
##
## Node number 1: 273928 observations, complexity param=0.08738626
## mean=261.6252, MSE=155258.3
## left son=2 (244050 obs) right son=3 (29878 obs)
## Primary splits:
## homepage_featured < 0.5 to the left, improve=0.08738626, (0 missing)
## checkout_price < 279.875 to the right, improve=0.07773662, (0 missing)
## emailer_for_promotion < 0.5 to the left, improve=0.07675609, (0 missing)
## base_price < 335.165 to the right, improve=0.06538558, (0 missing)
## meal_id < 1153.5 to the right, improve=0.01831642, (0 missing)
## Surrogate splits:
## emailer_for_promotion < 0.5 to the left, agree=0.894, adj=0.026, (0 split)
##
## Node number 2: 244050 observations, complexity param=0.04662786
## mean=220.8698, MSE=84590.39
## left son=4 (90149 obs) right son=5 (153901 obs)
## Primary splits:
## checkout_price < 335.165 to the right, improve=0.09605872, (0 missing)
## base_price < 335.165 to the right, improve=0.09471307, (0 missing)
## meal_id < 1153.5 to the right, improve=0.03402886, (0 missing)
## emailer_for_promotion < 0.5 to the left, improve=0.02413694, (0 missing)
## Surrogate splits:
## base_price < 335.165 to the right, agree=0.979, adj=0.942, (0 split)
## meal_id < 2846.5 to the right, agree=0.656, adj=0.068, (0 split)
##
## Node number 3: 29878 observations, complexity param=0.05941021
## mean=594.5243, MSE=608099.6
## left son=6 (14980 obs) right son=7 (14898 obs)
## Primary splits:
## checkout_price < 274.055 to the right, improve=0.13906760, (0 missing)
## base_price < 336.15 to the right, improve=0.08808724, (0 missing)
## emailer_for_promotion < 0.5 to the left, improve=0.04917462, (0 missing)
## meal_id < 2297 to the right, improve=0.03159082, (0 missing)
## Surrogate splits:
## base_price < 350.685 to the right, agree=0.899, adj=0.797, (0 split)
## meal_id < 1649 to the left, agree=0.595, adj=0.187, (0 split)
## emailer_for_promotion < 0.5 to the right, agree=0.554, adj=0.105, (0 split)
##
## Node number 4: 90149 observations
## mean=103.0904, MSE=16902.86
##
## Node number 5: 153901 observations, complexity param=0.01216105
## mean=289.8602, MSE=111353.7
## left son=10 (147774 obs) right son=11 (6127 obs)
## Primary splits:
## emailer_for_promotion < 0.5 to the left, improve=0.03017978, (0 missing)
## meal_id < 1153.5 to the right, improve=0.02080238, (0 missing)
## checkout_price < 241.075 to the right, improve=0.01565182, (0 missing)
## base_price < 151.835 to the right, improve=0.01102966, (0 missing)
##
## Node number 6: 14980 observations
## mean=304.5173, MSE=106740.3
##
## Node number 7: 14898 observations, complexity param=0.03534945
## mean=886.1275, MSE=942619.1
## left son=14 (9982 obs) right son=15 (4916 obs)
## Primary splits:
## emailer_for_promotion < 0.5 to the left, improve=0.10705570, (0 missing)
## meal_id < 2297 to the right, improve=0.06996197, (0 missing)
## base_price < 292.485 to the left, improve=0.05129185, (0 missing)
## checkout_price < 119.825 to the left, improve=0.02493541, (0 missing)
## Surrogate splits:
## base_price < 292.485 to the left, agree=0.735, adj=0.197, (0 split)
## meal_id < 1153.5 to the right, agree=0.690, adj=0.061, (0 split)
## checkout_price < 265.325 to the left, agree=0.677, adj=0.021, (0 split)
##
## Node number 10: 147774 observations, complexity param=0.0107927
## mean=278.056, MSE=82629.92
## left son=20 (135701 obs) right son=21 (12073 obs)
## Primary splits:
## meal_id < 1153.5 to the right, improve=0.02932473, (0 missing)
## checkout_price < 193.545 to the right, improve=0.01879387, (0 missing)
## base_price < 151.835 to the right, improve=0.01794499, (0 missing)
##
## Node number 11: 6127 observations
## mean=574.5588, MSE=719713.8
##
## Node number 14: 9982 observations, complexity param=0.01126135
## mean=663.1967, MSE=447604.9
## left son=28 (3856 obs) right son=29 (6126 obs)
## Primary splits:
## meal_id < 2298 to the right, improve=0.09244895, (0 missing)
## checkout_price < 193.545 to the right, improve=0.07078568, (0 missing)
## base_price < 151.835 to the right, improve=0.06617691, (0 missing)
## Surrogate splits:
## base_price < 216.795 to the right, agree=0.656, adj=0.110, (0 split)
## checkout_price < 192.575 to the right, agree=0.645, adj=0.081, (0 split)
##
## Node number 15: 4916 observations, complexity param=0.02132553
## mean=1338.791, MSE=1641934
## left son=30 (1868 obs) right son=31 (3048 obs)
## Primary splits:
## meal_id < 2297 to the right, improve=0.07878209, (0 missing)
## base_price < 292.485 to the left, improve=0.03615819, (0 missing)
## checkout_price < 119.795 to the left, improve=0.03465867, (0 missing)
## Surrogate splits:
## base_price < 256.595 to the left, agree=0.648, adj=0.073, (0 split)
##
## Node number 20: 135701 observations, complexity param=0.0107927
## mean=263.3735, MSE=80019.61
## left son=40 (23537 obs) right son=41 (112164 obs)
## Primary splits:
## meal_id < 1635 to the left, improve=0.02802507, (0 missing)
## base_price < 151.835 to the right, improve=0.02638343, (0 missing)
## checkout_price < 193.545 to the right, improve=0.01853121, (0 missing)
## Surrogate splits:
## base_price < 457.4 to the right, agree=0.827, adj=0.003, (0 split)
## checkout_price < 80.555 to the left, agree=0.827, adj=0.001, (0 split)
##
## Node number 21: 12073 observations
## mean=443.0885, MSE=82311.11
##
## Node number 28: 3856 observations
## mean=406.7964, MSE=167184.7
##
## Node number 29: 6126 observations, complexity param=0.01126135
## mean=824.5873, MSE=556687.4
## left son=58 (1641 obs) right son=59 (4485 obs)
## Primary splits:
## meal_id < 1635 to the left, improve=0.15975880, (0 missing)
## checkout_price < 193.545 to the right, improve=0.05329010, (0 missing)
## base_price < 151.835 to the right, improve=0.04718472, (0 missing)
## Surrogate splits:
## checkout_price < 241.075 to the right, agree=0.801, adj=0.255, (0 split)
## base_price < 185.315 to the right, agree=0.781, adj=0.181, (0 split)
##
## Node number 30: 1868 observations
## mean=879.3704, MSE=593239.5
##
## Node number 31: 3048 observations, complexity param=0.02132553
## mean=1620.352, MSE=2076007
## left son=62 (2732 obs) right son=63 (316 obs)
## Primary splits:
## meal_id < 2141.5 to the left, improve=0.18617010, (0 missing)
## base_price < 292.485 to the left, improve=0.05194058, (0 missing)
## checkout_price < 119.795 to the left, improve=0.04417040, (0 missing)
##
## Node number 40: 23537 observations
## mean=159.9967, MSE=26910.8
##
## Node number 41: 112164 observations, complexity param=0.0107927
## mean=285.0665, MSE=88451.05
## left son=82 (64359 obs) right son=83 (47805 obs)
## Primary splits:
## meal_id < 2297 to the right, improve=0.03282895, (0 missing)
## base_price < 151.835 to the right, improve=0.02887008, (0 missing)
## checkout_price < 193.545 to the right, improve=0.02580156, (0 missing)
## Surrogate splits:
## base_price < 214.885 to the right, agree=0.633, adj=0.139, (0 split)
## checkout_price < 185.785 to the right, agree=0.631, adj=0.135, (0 split)
##
## Node number 58: 1641 observations
## mean=331.5667, MSE=117781
##
## Node number 59: 4485 observations
## mean=1004.977, MSE=595801.1
##
## Node number 62: 2732 observations
## mean=1408.919, MSE=1334063
##
## Node number 63: 316 observations
## mean=3448.31, MSE=4762616
##
## Node number 82: 64359 observations
## mean=238.6244, MSE=58761.79
##
## Node number 83: 47805 observations, complexity param=0.0107927
## mean=347.5907, MSE=121608.1
## left son=166 (41889 obs) right son=167 (5916 obs)
## Primary splits:
## meal_id < 2214.5 to the left, improve=0.10749530, (0 missing)
## base_price < 153.275 to the right, improve=0.07608897, (0 missing)
## checkout_price < 153.275 to the right, improve=0.07152240, (0 missing)
##
## Node number 166: 41889 observations, complexity param=0.0107927
## mean=304.6232, MSE=104994.7
## left son=332 (32541 obs) right son=333 (9348 obs)
## Primary splits:
## base_price < 153.275 to the right, improve=0.15507530, (0 missing)
## checkout_price < 153.275 to the right, improve=0.15081430, (0 missing)
## meal_id < 2066 to the right, improve=0.07975996, (0 missing)
## Surrogate splits:
## checkout_price < 152.805 to the right, agree=0.98, adj=0.909, (0 split)
##
## Node number 167: 5916 observations
## mean=651.8276, MSE=133609.6
##
## Node number 332: 32541 observations
## mean=236.2322, MSE=54585.77
##
## Node number 333: 9348 observations
## mean=542.6966, MSE=207510.3
train_1$pred<-rpart.predict(dt,data=train_1,type="vector")
head(select(train_1,c(pred,num_orders)))
## pred num_orders
## 145464 238.6244 136
## 97364 103.0904 175
## 47312 574.5588 190
## 91432 103.0904 28
## 335394 238.6244 486
## 275856 304.5173 81
train_1$predRF<-predict(rf,data=train_1)
head(select(train_1,c(predRF,num_orders)))
## predRF num_orders
## 145464 355.8383 136
## 97364 122.1820 175
## 47312 342.4670 190
## 91432 121.7840 28
## 335394 259.3124 486
## 275856 278.1963 81
test_1$pred<-predict(dt,newdata = test_1)
test_1$predRF<-predict(rf,newdata = test_1)
e<-test_1$num_orders-test_1$pred
length(e)
## [1] 182619
e<-data.frame(row=1:182619,resid=e)
summary(e$resid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3069.310 -129.232 -56.697 -0.843 58.910 13224.023
rfE<-test_1$num_orders-test_1$predRF
rfE<-data.frame(row=1:182619,resid=rfE)
summary(rfE)
## row resid
## Min. : 1 Min. :-1185.807
## 1st Qu.: 45656 1st Qu.: -138.481
## Median : 91310 Median : -69.047
## Mean : 91310 Mean : -0.421
## 3rd Qu.:136965 3rd Qu.: 44.909
## Max. :182619 Max. :13235.620
par(mfrow=c(1,2))
boxplot(e$resid)
boxplot(rfE$resid)
submission<-predict(dt,newdata = test)
sub_df<-data.frame(id=test$id,num_orders=submission)
head(sub_df)
## id num_orders
## 1 1028232 236.2322
## 2 1127204 236.2322
## 3 1212707 238.6244
## 4 1082698 238.6244
## 5 1400926 159.9967
## 6 1284113 236.2322
sub<-predict(rf,newdata = test)
sub_df_1<-data.frame(id=test$id,num_orders=sub)
write.csv(sub_df,"submission2.csv",row.names = F)
write.csv(sub_df_1,"submission3.csv",row.names = F)