#TITLE: MANAGEMENT OF HOTELS IN INDIA
#NAME: Ayush Bose
#E-mail:boseayush384@gmail.com
#DATE: January 31,2018
#College: JIIT, Noida

#INTRODUCTION
#This project identifies the factors that matter the most for hotel room pricing.
#The dataset consists of data from different hotels located in different cities.
#This project is about the hotels in different cities of India.
#It shows the factors which impact the pricing of hotels, such as whether the city
#is a tourist destination or whether it is a weekend, which affect the price of a hotel room.

# Load the hotel pricing dataset from the working directory.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default, and the per-level counts in the summary()
# output recorded below depend on the text columns being factors.
mydata.df <- read.csv("Cities42.csv", stringsAsFactors = TRUE)
# View() opens a spreadsheet-style viewer, so it is only useful (and only safe)
# when the script is run in an interactive session.
if (interactive()) {
  View(mydata.df)
}
# Per-column overview: quartiles for numeric columns, level counts for factors.
summary(mydata.df)
##       CityName      Population          CityRank      IsMetroCity    
##  Delhi    :2048   Min.   :    8096   Min.   : 0.00   Min.   :0.0000  
##  Jaipur   : 768   1st Qu.:  744983   1st Qu.: 2.00   1st Qu.:0.0000  
##  Mumbai   : 712   Median : 3046163   Median : 9.00   Median :0.0000  
##  Bangalore: 656   Mean   : 4416837   Mean   :14.83   Mean   :0.2842  
##  Goa      : 624   3rd Qu.: 8443675   3rd Qu.:24.00   3rd Qu.:1.0000  
##  Kochi    : 608   Max.   :12442373   Max.   :44.00   Max.   :1.0000  
##  (Other)  :7816                                                      
##  IsTouristDestination   IsWeekend       IsNewYearEve             Date     
##  Min.   :0.0000       Min.   :0.0000   Min.   :0.0000   Dec 21 2016:1611  
##  1st Qu.:0.0000       1st Qu.:0.0000   1st Qu.:0.0000   Dec 24 2016:1611  
##  Median :1.0000       Median :1.0000   Median :0.0000   Dec 25 2016:1611  
##  Mean   :0.6972       Mean   :0.6228   Mean   :0.1244   Dec 28 2016:1611  
##  3rd Qu.:1.0000       3rd Qu.:1.0000   3rd Qu.:0.0000   Dec 31 2016:1611  
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.0000   Dec 18 2016:1608  
##                                                         (Other)    :3569  
##                   HotelName        RoomRent        StarRating   
##  Vivanta by Taj        :   32   Min.   :   299   Min.   :0.000  
##  Goldfinch Hotel       :   24   1st Qu.:  2436   1st Qu.:3.000  
##  OYO Rooms             :   24   Median :  4000   Median :3.000  
##  The Gordon House Hotel:   24   Mean   :  5474   Mean   :3.459  
##  Apnayt Villa          :   16   3rd Qu.:  6299   3rd Qu.:4.000  
##  Bentleys Hotel Colaba :   16   Max.   :322500   Max.   :5.000  
##  (Other)               :13096                                   
##     Airport      
##  Min.   :  0.20  
##  1st Qu.:  8.40  
##  Median : 15.00  
##  Mean   : 21.16  
##  3rd Qu.: 24.00  
##  Max.   :124.00  
##                  
##                                                                    HotelAddress  
##  The Mall, Shimla                                                        :   32  
##  #2-91/14/8, White Fields, Kondapur, Hitech City, Hyderabad, 500084 India:   16  
##  121, City Terrace, Walchand Hirachand Marg, Mumbai, Maharashtra         :   16  
##  14-4507/9, Balmatta Road, Near Jyothi Circle, Hampankatta               :   16  
##  144/7, Rajiv Gandi Salai (OMR), Kottivakkam, Chennai, Tamil Nadu        :   16  
##  17, Oliver Road, Colaba, Mumbai, Maharashtra                            :   16  
##  (Other)                                                                 :13120  
##   HotelPincode         HotelDescription    FreeWifi      FreeBreakfast   
##  Min.   : 100025   3           :  120   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 221001   Abc         :  112   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 395003   3-star hotel:  104   Median :1.0000   Median :1.0000  
##  Mean   : 397430   3.5         :   88   Mean   :0.9259   Mean   :0.6491  
##  3rd Qu.: 570001   4           :   72   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :7000157   (Other)     :12728   Max.   :1.0000   Max.   :1.0000  
##                    NA's        :    8                                    
##  HotelCapacity    HasSwimmingPool 
##  Min.   :  0.00   Min.   :0.0000  
##  1st Qu.: 16.00   1st Qu.:0.0000  
##  Median : 34.00   Median :0.0000  
##  Mean   : 62.51   Mean   :0.3558  
##  3rd Qu.: 75.00   3rd Qu.:1.0000  
##  Max.   :600.00   Max.   :1.0000  
## 
# attach() is intentionally not used: the statements in this script refer to
# columns through mydata.df explicitly (mydata.df$..., data = mydata.df or
# with()), and attaching the data frame would only risk masking other objects
# on the search path.
# Preview the first six rows of the data.
head(mydata.df)
##   CityName Population CityRank IsMetroCity IsTouristDestination IsWeekend
## 1   Mumbai   12442373        0           1                    1         1
## 2   Mumbai   12442373        0           1                    1         0
## 3   Mumbai   12442373        0           1                    1         1
## 4   Mumbai   12442373        0           1                    1         1
## 5   Mumbai   12442373        0           1                    1         0
## 6   Mumbai   12442373        0           1                    1         1
##   IsNewYearEve        Date      HotelName RoomRent StarRating Airport
## 1            0 Dec 18 2016 Vivanta by Taj    12375          5      21
## 2            0 Dec 21 2016 Vivanta by Taj    10250          5      21
## 3            0 Dec 24 2016 Vivanta by Taj     9900          5      21
## 4            0 Dec 25 2016 Vivanta by Taj    10350          5      21
## 5            0 Dec 28 2016 Vivanta by Taj    12000          5      21
## 6            1 Dec 31 2016 Vivanta by Taj    11475          5      21
##                                   HotelAddress HotelPincode
## 1 90 Cuffe Parade, Colaba, Mumbai, Maharashtra       400005
## 2 91 Cuffe Parade, Colaba, Mumbai, Maharashtra       400006
## 3 92 Cuffe Parade, Colaba, Mumbai, Maharashtra       400007
## 4 93 Cuffe Parade, Colaba, Mumbai, Maharashtra       400008
## 5 94 Cuffe Parade, Colaba, Mumbai, Maharashtra       400009
## 6 95 Cuffe Parade, Colaba, Mumbai, Maharashtra       400010
##                               HotelDescription FreeWifi FreeBreakfast
## 1 Luxury hotel with spa, near Gateway of India        1             0
## 2 Luxury hotel with spa, near Gateway of India        1             0
## 3 Luxury hotel with spa, near Gateway of India        1             0
## 4 Luxury hotel with spa, near Gateway of India        1             0
## 5 Luxury hotel with spa, near Gateway of India        1             0
## 6 Luxury hotel with spa, near Gateway of India        1             0
##   HotelCapacity HasSwimmingPool
## 1           287               1
## 2           287               1
## 3           287               1
## 4           287               1
## 5           287               1
## 6           287               1
# Number of rows and columns in the dataset.
c(nrow(mydata.df), ncol(mydata.df))
## [1] 13232    19
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) per column;
# columns marked with * in the output are non-numeric and summarised by level code.
psych::describe(mydata.df)
##                      vars     n       mean         sd  median    trimmed
## CityName*               1 13232      18.07      11.72      16      17.29
## Population              2 13232 4416836.87 4258386.00 3046163 4040816.22
## CityRank                3 13232      14.83      13.51       9      13.30
## IsMetroCity             4 13232       0.28       0.45       0       0.23
## IsTouristDestination    5 13232       0.70       0.46       1       0.75
## IsWeekend               6 13232       0.62       0.48       1       0.65
## IsNewYearEve            7 13232       0.12       0.33       0       0.03
## Date*                   8 13232      14.30       2.69      14      14.39
## HotelName*              9 13232     841.19     488.16     827     841.18
## RoomRent               10 13232    5473.99    7333.12    4000    4383.33
## StarRating             11 13232       3.46       0.76       3       3.40
## Airport                12 13232      21.16      22.76      15      16.39
## HotelAddress*          13 13232    1202.53     582.17    1261    1233.25
## HotelPincode           14 13232  397430.26  259837.50  395003  388540.47
## HotelDescription*      15 13224     581.34     363.26     567     575.37
## FreeWifi               16 13232       0.93       0.26       1       1.00
## FreeBreakfast          17 13232       0.65       0.48       1       0.69
## HotelCapacity          18 13232      62.51      76.66      34      46.03
## HasSwimmingPool        19 13232       0.36       0.48       0       0.32
##                             mad      min      max      range  skew
## CityName*                 11.86      1.0       42       41.0  0.48
## Population           3846498.95   8096.0 12442373 12434277.0  0.68
## CityRank                  11.86      0.0       44       44.0  0.69
## IsMetroCity                0.00      0.0        1        1.0  0.96
## IsTouristDestination       0.00      0.0        1        1.0 -0.86
## IsWeekend                  0.00      0.0        1        1.0 -0.51
## IsNewYearEve               0.00      0.0        1        1.0  2.28
## Date*                      2.97      1.0       20       19.0 -0.77
## HotelName*               641.97      1.0     1670     1669.0  0.01
## RoomRent                2653.85    299.0   322500   322201.0 16.75
## StarRating                 0.74      0.0        5        5.0  0.48
## Airport                   11.12      0.2      124      123.8  2.73
## HotelAddress*            668.65      1.0     2108     2107.0 -0.37
## HotelPincode          257975.37 100025.0  7000157  6900132.0  9.99
## HotelDescription*        472.95      1.0     1226     1225.0  0.11
## FreeWifi                   0.00      0.0        1        1.0 -3.25
## FreeBreakfast              0.00      0.0        1        1.0 -0.62
## HotelCapacity             28.17      0.0      600      600.0  2.95
## HasSwimmingPool            0.00      0.0        1        1.0  0.60
##                      kurtosis       se
## CityName*               -0.88     0.10
## Population              -1.08 37019.65
## CityRank                -0.76     0.12
## IsMetroCity             -1.08     0.00
## IsTouristDestination    -1.26     0.00
## IsWeekend               -1.74     0.00
## IsNewYearEve             3.18     0.00
## Date*                    1.92     0.02
## HotelName*              -1.25     4.24
## RoomRent               582.06    63.75
## StarRating               0.25     0.01
## Airport                  7.89     0.20
## HotelAddress*           -0.88     5.06
## HotelPincode           249.76  2258.86
## HotelDescription*       -1.25     3.16
## FreeWifi                 8.57     0.00
## FreeBreakfast           -1.61     0.00
## HotelCapacity           11.39     0.67
## HasSwimmingPool         -1.64     0.00
# One way contingency tables
# Number of rows at each star rating.
mytable <- table(StarRating = mydata.df$StarRating)
print(mytable)
## StarRating
##    0    1    2  2.5    3  3.2  3.3  3.4  3.5  3.6  3.7  3.8  3.9    4  4.1 
##   16    8  440  632 5953    8   16    8 1752    8   24   16   32 2463   24 
##  4.3  4.4  4.5  4.7  4.8    5 
##   16    8  376    8   16 1408
# Number of rows with (1) and without (0) free breakfast.
mytable2 <- table(FreeBreakfast = mydata.df$FreeBreakfast)
print(mytable2)
## FreeBreakfast
##    0    1 
## 4643 8589
# Number of rows for each city rank.
mytable3 <- table(CityRank = mydata.df$CityRank)
print(mytable3)
## CityRank
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  712 2048  656  416  536  424  512   80  600  768   32  128   16  136  160 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  432  448  624  128  264   40  224  336  392   48  160  120  272  104  456 
##   32   33   34   35   36   37   38   39   40   42   43   44 
##   48   56  280   64  136   88  128  136  264  144  328  288
# Two way contingency tables
# Free breakfast (rows) cross-tabulated against star rating (columns).
# Note: this reuses the name mytable, replacing the one-way table built earlier.
mytable <- xtabs(formula = ~ FreeBreakfast + StarRating, data = mydata.df)
print(mytable)
##              StarRating
## FreeBreakfast    0    1    2  2.5    3  3.2  3.3  3.4  3.5  3.6  3.7  3.8
##             0   16    0  216  296 1789    0    8    0  661    8    0    8
##             1    0    8  224  336 4164    8    8    8 1091    0   24    8
##              StarRating
## FreeBreakfast  3.9    4  4.1  4.3  4.4  4.5  4.7  4.8    5
##             0   16  783    0   16    0  224    8    0  594
##             1   16 1680   24    0    8  152    0   16  814
# Metro-city flag (rows) cross-tabulated against star rating (columns).
mytable1 <- xtabs(formula = ~ IsMetroCity + StarRating, data = mydata.df)
print(mytable1)
##            StarRating
## IsMetroCity    0    1    2  2.5    3  3.2  3.3  3.4  3.5  3.6  3.7  3.8
##           0   16    8  344  456 4336    8   16    8 1312    0   24   16
##           1    0    0   96  176 1617    0    0    0  440    8    0    0
##            StarRating
## IsMetroCity  3.9    4  4.1  4.3  4.4  4.5  4.7  4.8    5
##           0   32 1696   24   16    8  288    8   16  840
##           1    0  767    0    0    0   88    0    0  568
# Metro-city flag (rows) cross-tabulated against tourist-destination flag (columns).
# Note: this reuses the name mytable2, replacing the FreeBreakfast table built earlier.
mytable2 <- xtabs(formula = ~ IsMetroCity + IsTouristDestination, data = mydata.df)
print(mytable2)
##            IsTouristDestination
## IsMetroCity    0    1
##           0 3352 6120
##           1  655 3105
# HISTOGRAMS
# The first two histograms use freq = FALSE, which draws probability densities
# (total bar area is one) rather than relative frequencies, so their y-axis is
# labelled "Density".
hist(mydata.df$RoomRent,
     main = "Analysis of room rents of hotels",
     xlab = "Rents of room", ylab = "Density",
     breaks = 30, col = "lightblue", freq = FALSE)

hist(mydata.df$StarRating,
     main = "Analysis of star ratings of hotels",
     xlab = "Star ratings", ylab = "Density",
     breaks = 30, col = "red", freq = FALSE)

# The remaining histograms keep hist()'s default count scale and y-axis label.
hist(mydata.df$HotelCapacity, main = "Hotel Capacity", xlab = "Capacity",
     col = "green")

hist(mydata.df$Population, main = "Population", xlab = "Population",
     col = "yellow")

# BOXPLOTS
# Distribution of star ratings over all rows.
boxplot(mydata.df$StarRating, horizontal = TRUE, main = "Star Rating",
        col = "lightblue")

# City rank split by free-breakfast availability. The title names exactly the
# two variables that are plotted (no wifi variable is involved in this plot).
# With horizontal = TRUE the values run along the x-axis and the groups along
# the y-axis, which is why xlab describes the rank and ylab the grouping.
boxplot(CityRank ~ FreeBreakfast, data = mydata.df, horizontal = TRUE,
        main = "City rank by free breakfast availability",
        xlab = "Rank of city", ylab = "Breakfast availability",
        las = 1, col = c("red", "pink"))

# Room rent split by the metro-city flag.
boxplot(RoomRent ~ IsMetroCity, data = mydata.df, horizontal = TRUE,
        ylab = "City", xlab = "Room rent", las = 1,
        main = "Analysis of metro city and room rent of hotels",
        col = c("brown", "green"))

#SCATTER PLOT
library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Room rent against star rating; the smoother is drawn dashed (lty = 2) and
# spread lines are switched off.
# NOTE(review): spread / smoother.args appear to follow the interface of the
# car build used for the recorded run (built under R 3.4.3) - confirm they are
# still accepted by the installed car version.
scatterplot(
  RoomRent ~ StarRating,
  data = mydata.df,
  main = "Scatter plot of Star Rating vs Room rent",
  xlab = "Star Rating",
  ylab = "Room Rent",
  smoother.args = list(lty = 2),
  spread = FALSE
)

# Pairwise scatter plots of room rent, the weekend and New Year's Eve flags,
# and the Airport column, drawn with solid points (pch = 16).
scatterplotMatrix(
  ~ RoomRent + IsWeekend + IsNewYearEve + Airport,
  data = mydata.df,
  pch = 16
)
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

# Same two variables with the axes swapped: room rent on x, star rating on y.
with(
  mydata.df,
  scatterplot(
    x = RoomRent,
    y = StarRating,
    main = "Room Rent Vs Star Rating ",
    xlab = "Room Rent",
    ylab = "Star Rating"
  )
)

# Corrgrams
# Pearson correlation matrix of the numeric variables of interest. Columns are
# selected by name (these are the columns at positions 2-7, 10, 11 and 18 of
# the loaded data) so the result does not silently change if the column order
# of the CSV changes.
cor(mydata.df[, c("Population", "CityRank", "IsMetroCity",
                  "IsTouristDestination", "IsWeekend", "IsNewYearEve",
                  "RoomRent", "StarRating", "HotelCapacity")])
##                         Population      CityRank   IsMetroCity
## Population            1.0000000000 -0.8353204432  0.7712260105
## CityRank             -0.8353204432  1.0000000000 -0.5643937903
## IsMetroCity           0.7712260105 -0.5643937903  1.0000000000
## IsTouristDestination -0.0482029722  0.2807134520  0.1763717063
## IsWeekend             0.0115926802 -0.0072564766  0.0018118005
## IsNewYearEve          0.0007332482 -0.0006326444  0.0006464753
## RoomRent             -0.0887280632  0.0939855292 -0.0668397705
## StarRating            0.1341365933 -0.1333810133  0.0776028661
## HotelCapacity         0.2599830516 -0.2561197059  0.1871502153
##                      IsTouristDestination    IsWeekend  IsNewYearEve
## Population                   -0.048202972  0.011592680  0.0007332482
## CityRank                      0.280713452 -0.007256477 -0.0006326444
## IsMetroCity                   0.176371706  0.001811801  0.0006464753
## IsTouristDestination          1.000000000 -0.019481101 -0.0022663884
## IsWeekend                    -0.019481101  1.000000000  0.2923820508
## IsNewYearEve                 -0.002266388  0.292382051  1.0000000000
## RoomRent                      0.122502963  0.004580134  0.0384912269
## StarRating                   -0.040554998  0.006378436  0.0023608970
## HotelCapacity                -0.094356091  0.006306507  0.0013526790
##                          RoomRent   StarRating HotelCapacity
## Population           -0.088728063  0.134136593   0.259983052
## CityRank              0.093985529 -0.133381013  -0.256119706
## IsMetroCity          -0.066839771  0.077602866   0.187150215
## IsTouristDestination  0.122502963 -0.040554998  -0.094356091
## IsWeekend             0.004580134  0.006378436   0.006306507
## IsNewYearEve          0.038491227  0.002360897   0.001352679
## RoomRent              1.000000000  0.369373425   0.157873308
## StarRating            0.369373425  1.000000000   0.637430337
## HotelCapacity         0.157873308  0.637430337   1.000000000
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Corrgram of the whole data frame: shaded cells below the diagonal, pie
# glyphs above it, and variable names on the diagonal.
corrgram(
  mydata.df,
  main = "Corrgram of all  variables",
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

# Correlation tests
# Two-sided Pearson test of the correlation between room rent and star rating.
cor.test(
  x = mydata.df$RoomRent,
  y = mydata.df$StarRating,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
## 
##  Pearson's product-moment correlation
## 
## data:  mydata.df$RoomRent and mydata.df$StarRating
## t = 45.719, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3545660 0.3839956
## sample estimates:
##       cor 
## 0.3693734
# Two-sided Pearson test of the correlation between population and city rank.
cor.test(
  x = mydata.df$Population,
  y = mydata.df$CityRank,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
## 
##  Pearson's product-moment correlation
## 
## data:  mydata.df$Population and mydata.df$CityRank
## t = -174.77, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8403980 -0.8300962
## sample estimates:
##        cor 
## -0.8353204
# Two-sided Pearson test of the correlation between room rent and the
# New Year's Eve flag.
cor.test(
  x = mydata.df$RoomRent,
  y = mydata.df$IsNewYearEve,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
## 
##  Pearson's product-moment correlation
## 
## data:  mydata.df$RoomRent and mydata.df$IsNewYearEve
## t = 4.4306, df = 13230, p-value = 9.472e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02146637 0.05549377
## sample estimates:
##        cor 
## 0.03849123
#Chi-square Test
# NOTE(review): chisq.test() is given a single numeric vector here, so R runs a
# goodness-of-fit test "for given probabilities" (see the output below): every
# RoomRent value is treated as an observed count in its own category, which is
# why df = 13231 (number of rows minus one). No hypothesis about hotel pricing
# is stated for this test; a chi-square test of independence would instead need
# a contingency table of two categorical variables - confirm what was intended.
chisq.test(mydata.df$RoomRent)
## 
##  Chi-squared test for given probabilities
## 
## data:  mydata.df$RoomRent
## X-squared = 129980000, df = 13231, p-value < 2.2e-16
#p-value < 2.2e-16 (< 0.05), so the null hypothesis of this test is rejected in favour of the alternative hypothesis.

# T-test

# Null hypothesis: there is no difference between the room rent on New Year's
# Eve and on other days. Two-sided Welch test (unequal variances), 95% interval.
t.test(
  mydata.df$RoomRent ~ mydata.df$IsNewYearEve,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  mydata.df$RoomRent by mydata.df$IsNewYearEve
## t = -4.1793, df = 2065, p-value = 3.046e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1256.5297  -453.9099
## sample estimates:
## mean in group 0 mean in group 1 
##        5367.606        6222.826
#P-value = 3.046e-05 (< 0.05), which is small enough
#to reject the null hypothesis in favour of the alternative hypothesis.
#Hence there is a significant difference
#between the room rent on New Year's Eve and on other days.

# Null hypothesis: there is no difference between the room rent on weekends
# and on other days. Two-sided Welch test (unequal variances), 95% interval.
t.test(
  mydata.df$RoomRent ~ mydata.df$IsWeekend,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  mydata.df$RoomRent by mydata.df$IsWeekend
## t = -0.51853, df = 9999.4, p-value = 0.6041
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -331.2427  192.6559
## sample estimates:
## mean in group 0 mean in group 1 
##        5430.835        5500.129
#P-value = 0.6041 (> 0.05), so we fail to reject the null hypothesis.
#Hence there is no significant difference
#between the room rent on weekends and on other days.