# Directory holding the competition CSV files. Paths are built from it with
# file.path() so the session's working directory is left untouched.
data_dir <- "~/Downloads/Prof Sameer Mathur/Restaurant data"

# stringsAsFactors = TRUE keeps the text columns (Open.Date, City, City.Group,
# Type) as factors on R >= 4.0, matching the str()/summary() output recorded
# below; without it read.csv() returns them as character vectors.
train <- read.csv(file.path(data_dir, "train.csv"), header = TRUE,
                  stringsAsFactors = TRUE)
test <- read.csv(file.path(data_dir, "test.csv"), header = TRUE,
                 stringsAsFactors = TRUE)

# Print the structure (column names, types, leading values) of the training set.
str(train)
## 'data.frame': 137 obs. of 43 variables:
## $ Id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Open.Date : Factor w/ 134 levels "01/03/2014","01/07/2000",..: 60 17 21 11 42 16 94 53 78 114 ...
## $ City : Factor w/ 34 levels "Adana","Afyonkarahisar",..: 17 4 11 31 15 4 17 17 2 12 ...
## $ City.Group: Factor w/ 2 levels "Big Cities","Other": 1 1 2 2 2 1 1 1 2 2 ...
## $ Type : Factor w/ 3 levels "DT","FC","IL": 3 2 3 3 3 2 3 3 3 3 ...
## $ P1 : int 4 4 2 6 3 6 2 4 1 6 ...
## $ P2 : num 5 5 4 4.5 4 6 3 5 1 4.5 ...
## $ P3 : num 4 4 2 6 3 4.5 4 4 4 6 ...
## $ P4 : num 4 4 5 6 4 7.5 4 5 4 7.5 ...
## $ P5 : int 2 1 2 4 2 8 1 2 1 6 ...
## $ P6 : int 2 2 3 4 2 10 5 3 2 4 ...
## $ P7 : int 5 5 5 10 5 10 5 5 1 10 ...
## $ P8 : int 4 5 5 8 5 8 5 4 5 10 ...
## $ P9 : int 5 5 5 10 5 8 5 4 5 10 ...
## $ P10 : int 5 5 5 10 5 8 5 4 5 10 ...
## $ P11 : int 3 1 2 8 2 10 2 4 1 2 ...
## $ P12 : int 5 5 5 10 5 8 5 3 5 10 ...
## $ P13 : num 5 5 5 7.5 5 6 5 4 5 7.5 ...
## $ P14 : int 1 0 0 6 2 0 3 0 1 0 ...
## $ P15 : int 2 0 0 4 1 0 4 0 1 0 ...
## $ P16 : int 2 0 0 9 2 0 4 0 2 0 ...
## $ P17 : int 2 0 0 3 1 0 3 0 1 0 ...
## $ P18 : int 4 0 0 12 4 0 4 0 4 0 ...
## $ P19 : int 5 3 1 20 2 5 2 3 1 25 ...
## $ P20 : int 4 2 1 12 2 6 4 5 1 3 ...
## $ P21 : int 1 1 1 6 1 3 1 2 1 3 ...
## $ P22 : int 3 3 1 1 2 1 2 4 1 1 ...
## $ P23 : int 3 2 1 10 1 5 1 2 1 10 ...
## $ P24 : int 1 0 0 2 2 0 5 0 4 0 ...
## $ P25 : int 1 0 0 2 3 0 4 0 4 0 ...
## $ P26 : num 1 0 0 2.5 3 0 4 0 4 0 ...
## $ P27 : num 4 0 0 2.5 5 0 5 0 2 0 ...
## $ P28 : num 2 3 1 2.5 1 7.5 1 3 2 5 ...
## $ P29 : num 3 3 3 7.5 3 5 3 2 3 2.5 ...
## $ P30 : int 5 0 0 25 5 0 4 0 4 0 ...
## $ P31 : int 3 0 0 12 1 0 5 0 5 0 ...
## $ P32 : int 4 0 0 10 3 0 2 0 5 0 ...
## $ P33 : int 5 0 0 6 2 0 2 0 3 0 ...
## $ P34 : int 5 0 0 18 3 0 3 0 4 0 ...
## $ P35 : int 4 0 0 12 4 0 5 0 5 0 ...
## $ P36 : int 3 0 0 12 3 0 4 0 4 0 ...
## $ P37 : int 4 0 0 6 3 0 4 0 5 0 ...
## $ revenue : num 5653753 6923131 2055379 2675511 4316715 ...
# NOTE(review): no psych function is called in the visible part of this
# script (summary() below is base R) -- confirm whether psych is still needed.
library(psych)
# Per-column summary of the training set: quartiles and mean for numeric
# columns, level counts for factor columns.
summary(train)
## Id Open.Date City City.Group Type
## Min. : 0 01/07/2000: 2 Istanbul:50 Big Cities:78 DT: 1
## 1st Qu.: 34 02/02/2012: 2 Ankara :19 Other :59 FC:76
## Median : 68 02/23/2010: 2 Izmir : 9 IL:60
## Mean : 68 01/03/2014: 1 Bursa : 5
## 3rd Qu.:102 01/09/2010: 1 Samsun : 5
## Max. :136 01/17/2009: 1 Antalya : 4
## (Other) :128 (Other) :45
## P1 P2 P3 P4
## Min. : 1.000 Min. :1.000 Min. :0.000 Min. :3.000
## 1st Qu.: 2.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## Median : 3.000 Median :5.000 Median :4.000 Median :4.000
## Mean : 4.015 Mean :4.409 Mean :4.318 Mean :4.372
## 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :12.000 Max. :7.500 Max. :7.500 Max. :7.500
##
## P5 P6 P7 P8
## Min. :1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.: 2.000 1st Qu.: 5.000 1st Qu.: 4.000
## Median :2.000 Median : 3.000 Median : 5.000 Median : 5.000
## Mean :2.007 Mean : 3.358 Mean : 5.423 Mean : 5.153
## 3rd Qu.:2.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :8.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## P9 P10 P11 P12
## Min. : 4.000 Min. : 4.000 Min. : 1.000 Min. : 2.000
## 1st Qu.: 4.000 1st Qu.: 5.000 1st Qu.: 2.000 1st Qu.: 4.000
## Median : 5.000 Median : 5.000 Median : 3.000 Median : 5.000
## Mean : 5.445 Mean : 5.489 Mean : 3.263 Mean : 5.299
## 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000 3rd Qu.: 5.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## P13 P14 P15 P16
## Min. :3.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:5.00 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median :5.00 Median : 0.000 Median : 0.000 Median : 0.000
## Mean :5.08 Mean : 1.416 Mean : 1.387 Mean : 1.942
## 3rd Qu.:5.00 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 3.000
## Max. :7.50 Max. :15.000 Max. :10.000 Max. :15.000
##
## P17 P18 P19 P20
## Min. : 0.000 Min. : 0.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 2.000
## Median : 0.000 Median : 0.000 Median : 3.000 Median : 4.000
## Mean : 1.036 Mean : 1.942 Mean : 4.905 Mean : 4.547
## 3rd Qu.: 1.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :15.000 Max. :12.000 Max. :25.000 Max. :15.000
##
## P21 P22 P23 P24
## Min. : 1.00 Min. :1.000 Min. : 1.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 0.000
## Median : 1.00 Median :2.000 Median : 2.000 Median : 0.000
## Mean : 2.27 Mean :2.226 Mean : 3.423 Mean : 1.372
## 3rd Qu.: 3.00 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.: 2.000
## Max. :15.00 Max. :5.000 Max. :25.000 Max. :10.000
##
## P25 P26 P27 P28
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 2.500
## Mean : 1.212 Mean : 1.471 Mean : 1.146 Mean : 3.223
## 3rd Qu.: 2.000 3rd Qu.: 2.500 3rd Qu.: 2.000 3rd Qu.: 4.000
## Max. :10.000 Max. :12.500 Max. :12.500 Max. :12.500
##
## P29 P30 P31 P32
## Min. :0.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.500 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median :3.000 Median : 0.00 Median : 0.000 Median : 0.000
## Mean :3.135 Mean : 2.73 Mean : 1.942 Mean : 2.526
## 3rd Qu.:3.000 3rd Qu.: 4.00 3rd Qu.: 3.000 3rd Qu.: 3.000
## Max. :7.500 Max. :25.00 Max. :15.000 Max. :25.000
##
## P33 P34 P35 P36
## Min. :0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median :0.000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean :1.139 Mean : 2.489 Mean : 2.029 Mean : 2.212
## 3rd Qu.:2.000 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.: 3.000
## Max. :6.000 Max. :24.000 Max. :15.000 Max. :20.000
##
## P37 revenue
## Min. :0.000 Min. : 1149870
## 1st Qu.:0.000 1st Qu.: 2999068
## Median :0.000 Median : 3939804
## Mean :1.117 Mean : 4453533
## 3rd Qu.:2.000 3rd Qu.: 5166635
## Max. :8.000 Max. :19696939
##
# Cross-tabulate city group against restaurant type in the training set.
# Columns are resolved through `data = train` instead of attach(train), so
# nothing is added to the search path and no global variable of the same name
# can silently shadow a column.
xtabs(~ City.Group + Type, data = train)
## Type
## City.Group DT FC IL
## Big Cities 1 39 38
## Other 0 37 22
# Same cross-tabulation for the test set. Its output has an extra Type level
# (MB) that does not appear in the training table above.
xtabs(~test$City.Group+test$Type)
## test$Type
## test$City.Group DT FC IL MB
## Big Cities 1122 27319 20717 114
## Other 1122 29700 19730 176
# Average revenue across all training restaurants.
mean(train$revenue)
## [1] 4453533
# Restaurant(s) with the highest revenue. Columns are selected by name rather
# than by position (1, 3, 43) so the lookup survives a change in column order.
train[which(train$revenue == max(train$revenue)), c("Id", "City", "revenue")]
## Id City revenue
## 17 16 Istanbul 19696939
# Restaurant(s) with the lowest revenue.
train[which(train$revenue == min(train$revenue)), c("Id", "City", "revenue")]
## Id City revenue
## 22 21 Istanbul 1149870
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Scatter plot of revenue per city. Colour distinguishes the city group and
# point shape the restaurant type. Type is categorical, so it is mapped to
# shape rather than size: mapping it to size made ggplot2 warn
# "Using size for a discrete variable is not advised."
# ggplot() + geom_point() replaces qplot(), which is deprecated in
# ggplot2 >= 3.4.0; qplot() drew points for an x/y pair, so the geom is the same.
ggplot(train, aes(x = revenue, y = City, colour = City.Group, shape = Type)) +
  geom_point()
# Correlogram of the training data with the first three columns
# (Id, Open.Date, City) dropped: shaded cells below the diagonal, correlation
# coefficients above it, variable names on the diagonal.
library(corrgram)
corrgram(
  train[, -(1:3)],
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.cor,
  text.panel = panel.txt,
  main = "TFI revenue prediction"
)
We can see from the corrgram that only the variables P2 and P28 have a somewhat positive effect on revenue, as shown by their slightly darker shade of blue.
This dataset has variables P6 to P37 whose meaning is not explained by the dataset providers. However, it is clear that taking the analysis further requires feature engineering, a deep knowledge of clustering techniques, and the randomForest library.