Read files

# Load the training and test sets.
# Paths are built with file.path() instead of calling setwd(), so the script
# does not change the session's working directory as a side effect.
data_dir <- "~/Downloads/Prof Sameer Mathur/Restaurant data"
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default; the factor-based output below depends on it.
train <- read.csv(file.path(data_dir, "train.csv"), header = TRUE,
                  stringsAsFactors = TRUE)
test  <- read.csv(file.path(data_dir, "test.csv"), header = TRUE,
                  stringsAsFactors = TRUE)
# Compact overview of column types and the first few values of each column.
str(train)
## 'data.frame':    137 obs. of  43 variables:
##  $ Id        : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Open.Date : Factor w/ 134 levels "01/03/2014","01/07/2000",..: 60 17 21 11 42 16 94 53 78 114 ...
##  $ City      : Factor w/ 34 levels "Adana","Afyonkarahisar",..: 17 4 11 31 15 4 17 17 2 12 ...
##  $ City.Group: Factor w/ 2 levels "Big Cities","Other": 1 1 2 2 2 1 1 1 2 2 ...
##  $ Type      : Factor w/ 3 levels "DT","FC","IL": 3 2 3 3 3 2 3 3 3 3 ...
##  $ P1        : int  4 4 2 6 3 6 2 4 1 6 ...
##  $ P2        : num  5 5 4 4.5 4 6 3 5 1 4.5 ...
##  $ P3        : num  4 4 2 6 3 4.5 4 4 4 6 ...
##  $ P4        : num  4 4 5 6 4 7.5 4 5 4 7.5 ...
##  $ P5        : int  2 1 2 4 2 8 1 2 1 6 ...
##  $ P6        : int  2 2 3 4 2 10 5 3 2 4 ...
##  $ P7        : int  5 5 5 10 5 10 5 5 1 10 ...
##  $ P8        : int  4 5 5 8 5 8 5 4 5 10 ...
##  $ P9        : int  5 5 5 10 5 8 5 4 5 10 ...
##  $ P10       : int  5 5 5 10 5 8 5 4 5 10 ...
##  $ P11       : int  3 1 2 8 2 10 2 4 1 2 ...
##  $ P12       : int  5 5 5 10 5 8 5 3 5 10 ...
##  $ P13       : num  5 5 5 7.5 5 6 5 4 5 7.5 ...
##  $ P14       : int  1 0 0 6 2 0 3 0 1 0 ...
##  $ P15       : int  2 0 0 4 1 0 4 0 1 0 ...
##  $ P16       : int  2 0 0 9 2 0 4 0 2 0 ...
##  $ P17       : int  2 0 0 3 1 0 3 0 1 0 ...
##  $ P18       : int  4 0 0 12 4 0 4 0 4 0 ...
##  $ P19       : int  5 3 1 20 2 5 2 3 1 25 ...
##  $ P20       : int  4 2 1 12 2 6 4 5 1 3 ...
##  $ P21       : int  1 1 1 6 1 3 1 2 1 3 ...
##  $ P22       : int  3 3 1 1 2 1 2 4 1 1 ...
##  $ P23       : int  3 2 1 10 1 5 1 2 1 10 ...
##  $ P24       : int  1 0 0 2 2 0 5 0 4 0 ...
##  $ P25       : int  1 0 0 2 3 0 4 0 4 0 ...
##  $ P26       : num  1 0 0 2.5 3 0 4 0 4 0 ...
##  $ P27       : num  4 0 0 2.5 5 0 5 0 2 0 ...
##  $ P28       : num  2 3 1 2.5 1 7.5 1 3 2 5 ...
##  $ P29       : num  3 3 3 7.5 3 5 3 2 3 2.5 ...
##  $ P30       : int  5 0 0 25 5 0 4 0 4 0 ...
##  $ P31       : int  3 0 0 12 1 0 5 0 5 0 ...
##  $ P32       : int  4 0 0 10 3 0 2 0 5 0 ...
##  $ P33       : int  5 0 0 6 2 0 2 0 3 0 ...
##  $ P34       : int  5 0 0 18 3 0 3 0 4 0 ...
##  $ P35       : int  4 0 0 12 4 0 5 0 5 0 ...
##  $ P36       : int  3 0 0 12 3 0 4 0 4 0 ...
##  $ P37       : int  4 0 0 6 3 0 4 0 5 0 ...
##  $ revenue   : num  5653753 6923131 2055379 2675511 4316715 ...
library(psych)
# Per-column descriptive statistics for the training set: quartiles and mean
# for numeric columns, most frequent levels for factor columns.
summary(object = train)
##        Id           Open.Date         City         City.Group Type   
##  Min.   :  0   01/07/2000:  2   Istanbul:50   Big Cities:78   DT: 1  
##  1st Qu.: 34   02/02/2012:  2   Ankara  :19   Other     :59   FC:76  
##  Median : 68   02/23/2010:  2   Izmir   : 9                   IL:60  
##  Mean   : 68   01/03/2014:  1   Bursa   : 5                          
##  3rd Qu.:102   01/09/2010:  1   Samsun  : 5                          
##  Max.   :136   01/17/2009:  1   Antalya : 4                          
##                (Other)   :128   (Other) :45                          
##        P1               P2              P3              P4       
##  Min.   : 1.000   Min.   :1.000   Min.   :0.000   Min.   :3.000  
##  1st Qu.: 2.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median : 3.000   Median :5.000   Median :4.000   Median :4.000  
##  Mean   : 4.015   Mean   :4.409   Mean   :4.318   Mean   :4.372  
##  3rd Qu.: 4.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :12.000   Max.   :7.500   Max.   :7.500   Max.   :7.500  
##                                                                  
##        P5              P6               P7               P8        
##  Min.   :1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.: 5.000   1st Qu.: 4.000  
##  Median :2.000   Median : 3.000   Median : 5.000   Median : 5.000  
##  Mean   :2.007   Mean   : 3.358   Mean   : 5.423   Mean   : 5.153  
##  3rd Qu.:2.000   3rd Qu.: 4.000   3rd Qu.: 5.000   3rd Qu.: 5.000  
##  Max.   :8.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                    
##        P9              P10              P11              P12        
##  Min.   : 4.000   Min.   : 4.000   Min.   : 1.000   Min.   : 2.000  
##  1st Qu.: 4.000   1st Qu.: 5.000   1st Qu.: 2.000   1st Qu.: 4.000  
##  Median : 5.000   Median : 5.000   Median : 3.000   Median : 5.000  
##  Mean   : 5.445   Mean   : 5.489   Mean   : 3.263   Mean   : 5.299  
##  3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000   3rd Qu.: 5.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                     
##       P13            P14              P15              P16        
##  Min.   :3.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:5.00   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :5.00   Median : 0.000   Median : 0.000   Median : 0.000  
##  Mean   :5.08   Mean   : 1.416   Mean   : 1.387   Mean   : 1.942  
##  3rd Qu.:5.00   3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.: 3.000  
##  Max.   :7.50   Max.   :15.000   Max.   :10.000   Max.   :15.000  
##                                                                   
##       P17              P18              P19              P20        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 2.000   1st Qu.: 2.000  
##  Median : 0.000   Median : 0.000   Median : 3.000   Median : 4.000  
##  Mean   : 1.036   Mean   : 1.942   Mean   : 4.905   Mean   : 4.547  
##  3rd Qu.: 1.000   3rd Qu.: 4.000   3rd Qu.: 5.000   3rd Qu.: 5.000  
##  Max.   :15.000   Max.   :12.000   Max.   :25.000   Max.   :15.000  
##                                                                     
##       P21             P22             P23              P24        
##  Min.   : 1.00   Min.   :1.000   Min.   : 1.000   Min.   : 0.000  
##  1st Qu.: 1.00   1st Qu.:1.000   1st Qu.: 1.000   1st Qu.: 0.000  
##  Median : 1.00   Median :2.000   Median : 2.000   Median : 0.000  
##  Mean   : 2.27   Mean   :2.226   Mean   : 3.423   Mean   : 1.372  
##  3rd Qu.: 3.00   3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.: 2.000  
##  Max.   :15.00   Max.   :5.000   Max.   :25.000   Max.   :10.000  
##                                                                   
##       P25              P26              P27              P28        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 2.000  
##  Median : 0.000   Median : 0.000   Median : 0.000   Median : 2.500  
##  Mean   : 1.212   Mean   : 1.471   Mean   : 1.146   Mean   : 3.223  
##  3rd Qu.: 2.000   3rd Qu.: 2.500   3rd Qu.: 2.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :12.500   Max.   :12.500   Max.   :12.500  
##                                                                     
##       P29             P30             P31              P32        
##  Min.   :0.000   Min.   : 0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:2.500   1st Qu.: 0.00   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :3.000   Median : 0.00   Median : 0.000   Median : 0.000  
##  Mean   :3.135   Mean   : 2.73   Mean   : 1.942   Mean   : 2.526  
##  3rd Qu.:3.000   3rd Qu.: 4.00   3rd Qu.: 3.000   3rd Qu.: 3.000  
##  Max.   :7.500   Max.   :25.00   Max.   :15.000   Max.   :25.000  
##                                                                   
##       P33             P34              P35              P36        
##  Min.   :0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :0.000   Median : 0.000   Median : 0.000   Median : 0.000  
##  Mean   :1.139   Mean   : 2.489   Mean   : 2.029   Mean   : 2.212  
##  3rd Qu.:2.000   3rd Qu.: 3.000   3rd Qu.: 4.000   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :24.000   Max.   :15.000   Max.   :20.000  
##                                                                    
##       P37           revenue        
##  Min.   :0.000   Min.   : 1149870  
##  1st Qu.:0.000   1st Qu.: 2999068  
##  Median :0.000   Median : 3939804  
##  Mean   :1.117   Mean   : 4453533  
##  3rd Qu.:2.000   3rd Qu.: 5166635  
##  Max.   :8.000   Max.   :19696939  
## 
# Explore restaurant type by city group and locate the revenue extremes.
# Columns are reached through `data =` / `train$...` rather than attach(train):
# attach() copies the columns onto the search path, where they can be masked
# by (or silently diverge from) other objects of the same name.

# Restaurant type counts per city group in the training set.
xtabs(~ City.Group + Type, data = train)
##             Type
## City.Group   DT FC IL
##   Big Cities  1 39 38
##   Other       0 37 22
# Same cross-tabulation for the test set (which has an extra "MB" type).
xtabs(~ test$City.Group + test$Type)
##                test$Type
## test$City.Group    DT    FC    IL    MB
##      Big Cities  1122 27319 20717   114
##      Other       1122 29700 19730   176
mean(train$revenue)
## [1] 4453533
# Columns are selected by name instead of by position (1, 3, 43) so the
# lookup keeps working if the column order of the file changes.
# which() is kept so that every tied row is returned and NAs are dropped.
id_city_revenue <- c("Id", "City", "revenue")
# Restaurant(s) with the highest revenue.
train[which(train$revenue == max(train$revenue)), id_city_revenue]
##    Id     City  revenue
## 17 16 Istanbul 19696939
# Restaurant(s) with the lowest revenue.
train[which(train$revenue == min(train$revenue)), id_city_revenue]
##    Id     City revenue
## 22 21 Istanbul 1149870
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Scatter plot of revenue by city, coloured by city group and sized by
# restaurant type. Written with ggplot() + geom_point() because qplot() is
# deprecated in current ggplot2 releases; the aesthetics are unchanged.
ggplot(
  data = train,
  mapping = aes(x = revenue, y = City, colour = City.Group, size = Type)
) +
  geom_point()
## Warning: Using size for a discrete variable is not advised.

library(corrgram)
# Correlogram of the training data without its first three columns
# (Id, Open.Date and City); variables are reordered (order = TRUE), with
# shaded cells below the diagonal and correlation coefficients above it.
corrgram(
  train[, -(1:3)],
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.cor,
  text.panel = panel.txt,
  main = "TFI revenue prediction"
)

We can see from the corrgram that only the variables P2 and P28 have a somewhat positive correlation with revenue, as shown by a slightly darker shade of blue.

This dataset has variables P6 to P37 whose meaning is not made clear by the dataset providers. However, it is clear that carrying the analysis further requires feature engineering, a deep knowledge of clustering techniques, and the random forest library.