Hotel Room Pricing In The Indian Market

Name: Krushali Shah

College : VJTI College, Mumbai

Reading the data

# Load the hotel listings that every later step of this analysis works on.
# The CSV is addressed through its full path instead of setwd(), so the
# session's working directory is left untouched.
data_dir <- "C:/Users/Dell/Downloads/Sameer Mathur"
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well
# (it was the default before 4.0); the summary() output recorded in this
# document shows per-level counts for CityName, HotelName, etc.
hotel.df <- read.csv(file.path(data_dir, "Cities42.csv"),
                     stringsAsFactors = TRUE)
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(hotel.df)
}

Changing irregularities of dates

# Rewrite every irregular spelling of a date onto one "Mon DD YYYY" label.
# The replacements are applied one after another, in the order listed.
# Note that the two "-Jan-16" spellings are mapped to 2017 labels.
hotel.df$Date <- local({
  date_fixes <- c(
    "18-Dec-16"  = "Dec 18 2016",
    "21-Dec-16"  = "Dec 21 2016",
    "24-Dec-16"  = "Dec 24 2016",
    "25-Dec-16"  = "Dec 25 2016",
    "28-Dec-16"  = "Dec 28 2016",
    "31-Dec-16"  = "Dec 31 2016",
    "04-Jan-17"  = "Jan 04 2017",
    "04-Jan-16"  = "Jan 04 2017",
    "08-Jan-16"  = "Jan 08 2017",
    "08-Jan-17"  = "Jan 08 2017",
    "Jan 4 2017" = "Jan 04 2017",
    "Jan 8 2017" = "Jan 08 2017"
  )
  cleaned <- hotel.df$Date
  for (raw in names(date_fixes)) {
    cleaned <- gsub(raw, date_fixes[[raw]], cleaned)
  }
  cleaned
})

Checking dates

# Count the rows per date label to confirm only the cleaned spellings remain.
table(hotel.df[["Date"]])
## 
## Dec 18 2016 Dec 21 2016 Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016 
##        1652        1655        1655        1655        1655        1655 
## Jan 04 2017 Jan 08 2017 
##        1652        1653

Changing dates to factors for labelling

# Convert the cleaned date labels into a factor, then confirm the conversion.
hotel.df[["Date"]] <- factor(hotel.df[["Date"]])
is.factor(hotel.df[["Date"]])
## [1] TRUE

Checking the labelling

# List the factor levels that were created for Date.
levels(hotel.df[["Date"]])
## [1] "Dec 18 2016" "Dec 21 2016" "Dec 24 2016" "Dec 25 2016" "Dec 28 2016"
## [6] "Dec 31 2016" "Jan 04 2017" "Jan 08 2017"

Summary of all the variables

# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(object = hotel.df)
##       CityName      Population          CityRank      IsMetroCity    
##  Delhi    :2048   Min.   :    8096   Min.   : 0.00   Min.   :0.0000  
##  Jaipur   : 768   1st Qu.:  744983   1st Qu.: 2.00   1st Qu.:0.0000  
##  Mumbai   : 712   Median : 3046163   Median : 9.00   Median :0.0000  
##  Bangalore: 656   Mean   : 4416837   Mean   :14.83   Mean   :0.2842  
##  Goa      : 624   3rd Qu.: 8443675   3rd Qu.:24.00   3rd Qu.:1.0000  
##  Kochi    : 608   Max.   :12442373   Max.   :44.00   Max.   :1.0000  
##  (Other)  :7816                                                      
##  IsTouristDestination   IsWeekend       IsNewYearEve             Date     
##  Min.   :0.0000       Min.   :0.0000   Min.   :0.0000   Dec 21 2016:1655  
##  1st Qu.:0.0000       1st Qu.:0.0000   1st Qu.:0.0000   Dec 24 2016:1655  
##  Median :1.0000       Median :1.0000   Median :0.0000   Dec 25 2016:1655  
##  Mean   :0.6972       Mean   :0.6228   Mean   :0.1244   Dec 28 2016:1655  
##  3rd Qu.:1.0000       3rd Qu.:1.0000   3rd Qu.:0.0000   Dec 31 2016:1655  
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.0000   Jan 08 2017:1653  
##                                                         (Other)    :3304  
##                   HotelName        RoomRent        StarRating   
##  Vivanta by Taj        :   32   Min.   :   299   Min.   :0.000  
##  Goldfinch Hotel       :   24   1st Qu.:  2436   1st Qu.:3.000  
##  OYO Rooms             :   24   Median :  4000   Median :3.000  
##  The Gordon House Hotel:   24   Mean   :  5474   Mean   :3.459  
##  Apnayt Villa          :   16   3rd Qu.:  6299   3rd Qu.:4.000  
##  Bentleys Hotel Colaba :   16   Max.   :322500   Max.   :5.000  
##  (Other)               :13096                                   
##     Airport      
##  Min.   :  0.20  
##  1st Qu.:  8.40  
##  Median : 15.00  
##  Mean   : 21.16  
##  3rd Qu.: 24.00  
##  Max.   :124.00  
##                  
##                                                                    HotelAddress  
##  The Mall, Shimla                                                        :   32  
##  #2-91/14/8, White Fields, Kondapur, Hitech City, Hyderabad, 500084 India:   16  
##  121, City Terrace, Walchand Hirachand Marg, Mumbai, Maharashtra         :   16  
##  14-4507/9, Balmatta Road, Near Jyothi Circle, Hampankatta               :   16  
##  144/7, Rajiv Gandi Salai (OMR), Kottivakkam, Chennai, Tamil Nadu        :   16  
##  17, Oliver Road, Colaba, Mumbai, Maharashtra                            :   16  
##  (Other)                                                                 :13120  
##   HotelPincode         HotelDescription    FreeWifi      FreeBreakfast   
##  Min.   : 100025   3           :  120   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 221001   Abc         :  112   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 395003   3-star hotel:  104   Median :1.0000   Median :1.0000  
##  Mean   : 397430   3.5         :   88   Mean   :0.9259   Mean   :0.6491  
##  3rd Qu.: 570001   4           :   72   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :7000157   (Other)     :12728   Max.   :1.0000   Max.   :1.0000  
##                    NA's        :    8                                    
##  HotelCapacity    HasSwimmingPool 
##  Min.   :  0.00   Min.   :0.0000  
##  1st Qu.: 16.00   1st Qu.:0.0000  
##  Median : 34.00   Median :0.0000  
##  Mean   : 62.51   Mean   :0.3558  
##  3rd Qu.: 75.00   3rd Qu.:1.0000  
##  Max.   :600.00   Max.   :1.0000  
## 
# psych::describe() adds sd, trimmed mean, mad, skew, kurtosis and se per column.
library(psych)
describe(x = hotel.df)
##                      vars     n       mean         sd  median    trimmed
## CityName*               1 13232      18.07      11.72      16      17.29
## Population              2 13232 4416836.87 4258386.00 3046163 4040816.22
## CityRank                3 13232      14.83      13.51       9      13.30
## IsMetroCity             4 13232       0.28       0.45       0       0.23
## IsTouristDestination    5 13232       0.70       0.46       1       0.75
## IsWeekend               6 13232       0.62       0.48       1       0.65
## IsNewYearEve            7 13232       0.12       0.33       0       0.03
## Date*                   8 13232       4.50       2.29       4       4.50
## HotelName*              9 13232     841.19     488.16     827     841.18
## RoomRent               10 13232    5473.99    7333.12    4000    4383.33
## StarRating             11 13232       3.46       0.76       3       3.40
## Airport                12 13232      21.16      22.76      15      16.39
## HotelAddress*          13 13232    1202.53     582.17    1261    1233.25
## HotelPincode           14 13232  397430.26  259837.50  395003  388540.47
## HotelDescription*      15 13224     581.34     363.26     567     575.37
## FreeWifi               16 13232       0.93       0.26       1       1.00
## FreeBreakfast          17 13232       0.65       0.48       1       0.69
## HotelCapacity          18 13232      62.51      76.66      34      46.03
## HasSwimmingPool        19 13232       0.36       0.48       0       0.32
##                             mad      min      max      range  skew
## CityName*                 11.86      1.0       42       41.0  0.48
## Population           3846498.95   8096.0 12442373 12434277.0  0.68
## CityRank                  11.86      0.0       44       44.0  0.69
## IsMetroCity                0.00      0.0        1        1.0  0.96
## IsTouristDestination       0.00      0.0        1        1.0 -0.86
## IsWeekend                  0.00      0.0        1        1.0 -0.51
## IsNewYearEve               0.00      0.0        1        1.0  2.28
## Date*                      2.97      1.0        8        7.0  0.00
## HotelName*               641.97      1.0     1670     1669.0  0.01
## RoomRent                2653.85    299.0   322500   322201.0 16.75
## StarRating                 0.74      0.0        5        5.0  0.48
## Airport                   11.12      0.2      124      123.8  2.73
## HotelAddress*            668.65      1.0     2108     2107.0 -0.37
## HotelPincode          257975.37 100025.0  7000157  6900132.0  9.99
## HotelDescription*        472.95      1.0     1226     1225.0  0.11
## FreeWifi                   0.00      0.0        1        1.0 -3.25
## FreeBreakfast              0.00      0.0        1        1.0 -0.62
## HotelCapacity             28.17      0.0      600      600.0  2.95
## HasSwimmingPool            0.00      0.0        1        1.0  0.60
##                      kurtosis       se
## CityName*               -0.88     0.10
## Population              -1.08 37019.65
## CityRank                -0.76     0.12
## IsMetroCity             -1.08     0.00
## IsTouristDestination    -1.26     0.00
## IsWeekend               -1.74     0.00
## IsNewYearEve             3.18     0.00
## Date*                   -1.24     0.02
## HotelName*              -1.25     4.24
## RoomRent               582.06    63.75
## StarRating               0.25     0.01
## Airport                  7.89     0.20
## HotelAddress*           -0.88     5.06
## HotelPincode           249.76  2258.86
## HotelDescription*       -1.25     3.16
## FreeWifi                 8.57     0.00
## FreeBreakfast           -1.61     0.00
## HotelCapacity           11.39     0.67
## HasSwimmingPool         -1.64     0.00

Dependent variable: Room rent. We use a corrgram to find the independent variables which influence the dependent variable the most.

# Correlogram over the whole data frame: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
library(corrgram)
corrgram(
  hotel.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of all the factors affecting Hotel Room Pricing"
)

From the above corrgram we can see that the three factors affecting the room rent the most are the star rating, the hotel capacity and the presence of a swimming pool in the hotel.

Testing each individual independently now.

Swimming pool

# Tabulate the listings without (0) and with (1) a swimming pool once,
# print the counts, and reuse the same table for the bar plot.
Swim <- table(hotel.df$HasSwimmingPool)
Swim
## 
##    0    1 
## 8524 4708
barplot(Swim, main = "Barplot of Hotel Swimming Pool")

From the bar plot we can see that fewer hotels have a swimming pool (4708) than do not have one (8524).

Star Rating

# Tabulate the listings per star rating once, print the counts, and reuse the
# same table for the bar plot.
starRating <- table(hotel.df$StarRating)
starRating
## 
##    0    1    2  2.5    3  3.2  3.3  3.4  3.5  3.6  3.7  3.8  3.9    4  4.1 
##   16    8  440  632 5953    8   16    8 1752    8   24   16   32 2463   24 
##  4.3  4.4  4.5  4.7  4.8    5 
##   16    8  376    8   16 1408
barplot(starRating, main = "Barplot for Star Rating")

From the bar plot we can see that maximum number of hotels have a 3-star rating.

Hotel Capacity

# Horizontal boxplot showing the spread of HotelCapacity across all listings.
boxplot(
  hotel.df[["HotelCapacity"]],
  main = "Boxplot for Hotel Capacity",
  horizontal = TRUE
)

Testing each variable pair-wise with Room rent

Drawing scatterplots for all the pairs

# Scatterplots of RoomRent against each of the three candidate predictors.
# RoomRent is the dependent variable of this analysis, so it is drawn on the
# y-axis in all three plots.
plot(hotel.df$StarRating, hotel.df$RoomRent,
     main = "RoomRent of Hotels  with StarRating",
     xlab = "Star rating out of 5", ylab = "RoomRent in INR", cex = 1.0)

plot(hotel.df$HotelCapacity, hotel.df$RoomRent,
     main = "RoomRent of Hotels  with Hotel capacity",
     xlab = "Hotel Capacity in rooms", ylab = "RoomRent in INR", cex = 1.0)

plot(hotel.df$HasSwimmingPool, hotel.df$RoomRent,
     main = "RoomRent of Hotels  with HasSwimmingPool",
     xlab = "Has Swimming Pool", ylab = "RoomRent in INR", cex = 1.0)

Plotting a corrgram of the most important variables

# Collect RoomRent and the three shortlisted predictors in their own data
# frame, then draw the same style of correlogram on just those columns.
xy <- data.frame(
  hotel.df$RoomRent,
  hotel.df$StarRating,
  hotel.df$HasSwimmingPool,
  hotel.df$HotelCapacity
)
corrgram(
  xy,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

Creating a correlation and covariance matrix

# x holds the three predictor columns, y the RoomRent vector; cor() returns
# one correlation with RoomRent per predictor.
predictor_cols <- c("StarRating", "HotelCapacity", "HasSwimmingPool")
x <- hotel.df[, predictor_cols]
y <- hotel.df[["RoomRent"]]
cor(x, y)
##                      [,1]
## StarRating      0.3693734
## HotelCapacity   0.1578733
## HasSwimmingPool 0.3116577
# Covariance of each predictor column in x with RoomRent (y).
cov(x = x, y = y)
##                      [,1]
## StarRating       2048.375
## HotelCapacity   88753.413
## HasSwimmingPool  1094.202

Creating a data frame containing only the rooms whose rent is less than 100000, because the average is affected by outliers

# Keep only the listings whose rent is below 100000; rows with a missing
# RoomRent are dropped as well.
below_cutoff <- !is.na(hotel.df$RoomRent) & hotel.df$RoomRent < 100000
RoomRent1.df <- hotel.df[below_cutoff, ]

Visualizing the effect of other variables

IsMetroCity

# Number of listings per IsMetroCity value, shown as a bar plot.
metro <- table(hotel.df$IsMetroCity)
barplot(
  metro,
  main = "Distribution of IsMetroCity",
  xlab = "Not a Metro city(0)         Is a Metro City(1)",
  col = "blue"
)

# Mean RoomRent within each IsMetroCity group.
aggregate(hotel.df$RoomRent, by = list(hotel.df$IsMetroCity), FUN = mean)
##   Group.1        x
## 1       0 5782.794
## 2       1 4696.073
# Draw the same horizontal boxplot twice: first on every listing, then on
# RoomRent1.df (rents below 100000).
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ IsMetroCity,
      data = listings,
      main = "Room rent vs. IsMetroCity",
      ylab = "Not a Metro city(0) or Metro City(1)",
      xlab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow"),
      horizontal = TRUE
    )
  }
))

IsTouristDestination

# Number of listings per IsTouristDestination value, shown as a bar plot.
tourist <- table(hotel.df$IsTouristDestination)
barplot(
  tourist,
  main = "Distribution of IsTouristDestination",
  xlab = "Not a Tourist Destination(0)         Is a Tourist Destination(1)",
  col = "blue"
)

# Mean RoomRent within each IsTouristDestination group.
aggregate(
  hotel.df$RoomRent,
  by = list(hotel.df$IsTouristDestination),
  FUN = mean
)
##   Group.1        x
## 1       0 4111.003
## 2       1 6066.024
# Draw the same horizontal boxplot twice: first on every listing, then on
# RoomRent1.df (rents below 100000).
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ IsTouristDestination,
      data = listings,
      main = "Room rent vs. IsTouristDestination",
      ylab = "Not a Tourist Destination(0)         Is a Tourist Destination(1)",
      xlab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow"),
      horizontal = TRUE
    )
  }
))

From the above boxplot we can see that the rent of a room in a tourist place is more than that in a non tourist place.

IsWeekend

# Number of listings per IsWeekend value, shown as a bar plot.
weekend <- table(hotel.df$IsWeekend)
barplot(
  weekend,
  main = "Distribution of IsWeekend",
  xlab = "Not a Weekend(0)         Is a Weekend(1)",
  col = "blue"
)

# Mean RoomRent within each IsWeekend group.
aggregate(hotel.df$RoomRent, by = list(hotel.df$IsWeekend), FUN = mean)
##   Group.1        x
## 1       0 5430.835
## 2       1 5500.129
# Draw the same horizontal boxplot twice: first on every listing, then on
# RoomRent1.df (rents below 100000).
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ IsWeekend,
      data = listings,
      main = "Room rent vs. IsWeekend",
      ylab = "Not a Weekend(0)         Is a Weekend(1)",
      xlab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow"),
      horizontal = TRUE
    )
  }
))

Dates

# Number of listings recorded for each date label, shown as a bar plot.
date <- table(hotel.df$Date)
barplot(date, main = "Distribution of Dates", col = "blue")

# Mean RoomRent per date, kept in `d` and then printed.
d <- aggregate(hotel.df$RoomRent, by = list(hotel.df$Date), FUN = mean)
print(d)
##       Group.1        x
## 1 Dec 18 2016 4896.402
## 2 Dec 21 2016 5085.315
## 3 Dec 24 2016 5543.236
## 4 Dec 25 2016 5464.143
## 5 Dec 28 2016 5593.924
## 6 Dec 31 2016 6191.776
## 7 Jan 04 2017 5674.062
## 8 Jan 08 2017 5342.234
# Boxplots of RoomRent per date: first on every listing, then on RoomRent1.df
# (rents below 100000). These boxplots are vertical, so the dates run along
# the x-axis and the rent is on the y-axis; the axis labels say so.
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ Date,
      data = listings,
      main = "Room rent vs. Dates",
      xlab = "Date",
      ylab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow")
    )
  }
))

From the above boxplots we can see that the room rents are the highest on 31st December.

Airport Distance

# Number of listings per distinct Airport value, shown as a bar plot.
airport <- table(hotel.df$Airport)
barplot(
  airport,
  main = "Distribution of Distance from Airport",
  col = "blue"
)

# Mean RoomRent for every distinct Airport value (one output row per value).
aggregate(hotel.df$RoomRent, by = list(hotel.df$Airport), FUN = mean)
##     Group.1         x
## 1       0.2  5247.000
## 2       0.3  2217.500
## 3       0.4  4987.025
## 4       0.5  2437.458
## 5       0.6  4805.781
## 6       0.7  2894.750
## 7       0.8  4687.500
## 8       0.9  4356.128
## 9       1.0  4452.812
## 10      1.1 43487.500
## 11      1.2  5154.050
## 12      1.4 15562.500
## 13      1.5  7273.250
## 14      1.6 10460.469
## 15      1.7  4968.091
## 16      1.8  3298.681
## 17      1.9  6440.400
## 18      2.0  5804.821
## 19      2.1  3400.975
## 20      2.2  4448.000
## 21      2.3  8030.938
## 22      2.4  5419.656
## 23      2.5  5863.000
## 24      2.6  3027.396
## 25      2.7  4659.054
## 26      2.8  4563.417
## 27      2.9  3667.232
## 28      3.0  4636.929
## 29      3.1  5859.875
## 30      3.2  7848.667
## 31      3.3  8960.125
## 32      3.4  6088.854
## 33      3.5  4267.839
## 34      3.6  6699.828
## 35      3.7  2603.125
## 36      3.8  3145.825
## 37      3.9  4075.250
## 38      4.0  4013.944
## 39      4.1  4218.688
## 40      4.2  3857.075
## 41      4.3  4388.656
## 42      4.4  5525.062
## 43      4.5  6135.208
## 44      4.6  5061.025
## 45      4.7  2378.125
## 46      4.8  3889.250
## 47      4.9  3421.219
## 48      5.0  6144.041
## 49      5.1  6547.000
## 50      5.2  4390.306
## 51      5.3  4119.250
## 52      5.4  4705.900
## 53      5.5  4371.625
## 54      5.6  5430.900
## 55      5.7  6046.594
## 56      5.8  5625.607
## 57      5.9  5016.650
## 58      6.0  3875.970
## 59      6.1  5853.375
## 60      6.2  2701.828
## 61      6.3  1705.125
## 62      6.4  3212.479
## 63      6.5  4871.208
## 64      6.6  3623.625
## 65      6.7  2994.292
## 66      6.8  4132.357
## 67      6.9  2749.950
## 68      7.0  4028.469
## 69      7.1  2366.806
## 70      7.2  3611.604
## 71      7.3  2522.042
## 72      7.4  3571.975
## 73      7.5  4816.667
## 74      7.6  4420.141
## 75      7.7 27828.708
## 76      7.8  5656.594
## 77      7.9  2730.958
## 78      8.0  9879.685
## 79      8.1  5120.639
## 80      8.2  3684.607
## 81      8.3  5117.825
## 82      8.4  4164.208
## 83      8.5  2991.688
## 84      8.6  1378.938
## 85      8.7  4457.446
## 86      8.8  3498.562
## 87      8.9  2627.250
## 88      9.0  3618.694
## 89      9.1  5122.458
## 90      9.2  9520.790
## 91      9.3  4147.375
## 92      9.4  5233.500
## 93      9.5  8224.909
## 94      9.6  3226.050
## 95      9.7  4376.083
## 96      9.8  4045.625
## 97      9.9  7947.732
## 98     10.0  5184.302
## 99     10.2  2780.000
## 100    10.3  2587.000
## 101    10.4  2346.000
## 102    10.6  1574.375
## 103    10.7  7025.000
## 104    10.8 12157.875
## 105    10.9  1949.812
## 106    11.0  4665.262
## 107    11.1  2867.250
## 108    11.3  1948.812
## 109    11.7  4069.000
## 110    11.9  7264.938
## 111    12.0  5014.164
## 112    12.2  3113.458
## 113    12.3  1746.750
## 114    12.6  4241.000
## 115    12.7  4566.750
## 116    13.0  6872.332
## 117    13.1  2525.000
## 118    13.3  4881.250
## 119    13.5  1831.250
## 120    13.6  4371.333
## 121    13.7  5908.812
## 122    13.8  2507.500
## 123    14.0  3632.243
## 124    14.2  1801.000
## 125    14.4  4002.167
## 126    14.5  3847.500
## 127    14.6  6711.438
## 128    14.7  5431.167
## 129    14.8  7086.625
## 130    14.9  4631.250
## 131    15.0  4804.245
## 132    15.3  2983.875
## 133    15.4  5179.125
## 134    15.6  4233.375
## 135    15.7  3385.250
## 136    15.8  5960.500
## 137    15.9  9961.875
## 138    16.0  5052.724
## 139    16.1 10451.000
## 140    16.2  4637.250
## 141    16.4  2404.250
## 142    16.5  4639.250
## 143    16.7  6648.281
## 144    17.0  5245.613
## 145    17.1  3251.000
## 146    17.2  4874.500
## 147    17.4  1911.750
## 148    17.5 16538.125
## 149    17.6  6273.000
## 150    17.8  4139.438
## 151    18.0  5023.542
## 152    18.3  6125.000
## 153    18.5  3543.250
## 154    18.6  6693.750
## 155    18.7  2782.625
## 156    19.0 10216.920
## 157    19.5  2262.500
## 158    19.9  7232.500
## 159    20.0  5474.096
## 160    20.2  8412.500
## 161    20.3  3930.812
## 162    20.5  2169.625
## 163    20.9  6281.750
## 164    21.0  4546.419
## 165    21.4  6944.500
## 166    21.5  3882.750
## 167    22.0  4453.590
## 168    22.1  5305.000
## 169    22.2  3235.000
## 170    22.4  3887.500
## 171    22.5  6103.250
## 172    23.0  5019.740
## 173    23.2 10887.500
## 174    23.3  5088.000
## 175    23.4  4942.375
## 176    24.0  3863.335
## 177    24.2 38115.625
## 178    24.3 16894.500
## 179    24.5  5305.750
## 180    24.6 45274.375
## 181    24.7  2078.000
## 182    24.9 20867.438
## 183    25.0  5229.457
## 184    25.6  7140.625
## 185    25.7  6137.500
## 186    25.9 15937.500
## 187    26.0  6258.703
## 188    26.1 26156.250
## 189    26.3  2369.250
## 190    26.4  7483.000
## 191    26.5  6112.500
## 192    26.7  7992.500
## 193    27.0  5835.206
## 194    27.1 23437.500
## 195    27.2  4832.000
## 196    28.0  3282.277
## 197    28.1  7140.625
## 198    28.6  7518.750
## 199    28.7  3781.625
## 200    29.0  3602.364
## 201    30.0  5784.393
## 202    30.5 20500.000
## 203    31.0  4943.406
## 204    31.2  6193.750
## 205    31.3  9125.000
## 206    31.9  4204.750
## 207    32.0  5803.528
## 208    32.9  7936.875
## 209    33.0  3026.100
## 210    33.4  6292.000
## 211    34.0  5784.875
## 212    35.0  8111.898
## 213    36.0  7528.882
## 214    36.2  6871.500
## 215    37.0  8712.878
## 216    38.0  6006.755
## 217    38.3  8117.875
## 218    39.0  4524.650
## 219    39.9  2206.500
## 220    40.0  5576.768
## 221    41.0  5355.676
## 222    42.0  3292.293
## 223    42.7  4118.750
## 224    43.0  7559.758
## 225    43.9  9247.500
## 226    44.0  5925.000
## 227    44.5  4233.125
## 228    44.6  7147.000
## 229    44.8 33033.500
## 230    46.0  4236.850
## 231    47.0  7256.000
## 232    47.5 19108.125
## 233    48.0  4268.750
## 234    48.4  3000.000
## 235    49.0 18237.500
## 236    50.0  5681.875
## 237    50.1  2360.875
## 238    50.5  3417.750
## 239    51.0  3178.250
## 240    52.0  4198.375
## 241    52.7  7820.000
## 242    53.0  4062.500
## 243    55.0 18950.000
## 244    57.2 15375.000
## 245    60.0  2846.000
## 246    61.0 14319.062
## 247    62.0  5412.719
## 248    63.0  8687.500
## 249    63.5  3900.000
## 250    63.6  2625.000
## 251    65.0  6257.888
## 252    67.6  4149.750
## 253    69.0  2682.125
## 254    73.1  3172.500
## 255    80.0  2554.000
## 256    80.3  1117.750
## 257    81.0  2554.000
## 258    82.0  6717.111
## 259    83.0  2554.000
## 260    84.0  2554.000
## 261    85.0  2554.000
## 262    86.0  2554.000
## 263    87.0  2554.000
## 264    91.3  1758.875
## 265    96.5  3821.375
## 266   100.0  6144.257
## 267   102.4  6444.750
## 268   105.0  8162.371
## 269   110.0  5976.109
## 270   117.4  6337.375
## 271   124.0  4629.648
# Boxplots of RoomRent per distinct Airport value: first on every listing,
# then on RoomRent1.df (rents below 100000). These boxplots are vertical, so
# the Airport values run along the x-axis and the rent is on the y-axis.
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ Airport,
      data = listings,
      main = "Room rent vs. Distance from Airport",
      xlab = "Distance from Airport",
      ylab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow")
    )
  }
))

Free Wifi

# Number of listings per FreeWifi value, shown as a bar plot.
wifi <- table(hotel.df$FreeWifi)
barplot(wifi, main = "Effect of Free Wifi", col = "blue")

# Mean RoomRent within each FreeWifi group.
aggregate(hotel.df$RoomRent, by = list(hotel.df$FreeWifi), FUN = mean)
##   Group.1        x
## 1       0 5380.004
## 2       1 5481.518
# Boxplots of RoomRent per FreeWifi value: first on every listing, then on
# RoomRent1.df (rents below 100000). These boxplots are vertical, so the
# FreeWifi groups are on the x-axis and the rent is on the y-axis.
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ FreeWifi,
      data = listings,
      main = "Room rent vs. Free Wifi",
      xlab = "FreeWifi (0 or 1)",
      ylab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow")
    )
  }
))

From the above box plot we can see that rooms having free wifi have a higher rent than those which don’t have free Wifi.

Free Breakfast

# Number of listings per FreeBreakfast value, shown as a bar plot.
breakfast <- table(hotel.df$FreeBreakfast)
barplot(breakfast, main = "Effect of FreeBreakfast", col = "blue")

# Mean RoomRent within each FreeBreakfast group.
aggregate(hotel.df$RoomRent, by = list(hotel.df$FreeBreakfast), FUN = mean)
##   Group.1        x
## 1       0 5573.790
## 2       1 5420.044
# Boxplots of RoomRent per FreeBreakfast value: first on every listing, then
# on RoomRent1.df (rents below 100000). These boxplots are vertical, so the
# FreeBreakfast groups are on the x-axis and the rent is on the y-axis.
invisible(lapply(
  list(hotel.df, RoomRent1.df),
  function(listings) {
    boxplot(
      RoomRent ~ FreeBreakfast,
      data = listings,
      main = "Room rent vs. Free Breakfast",
      xlab = "FreeBreakfast (0 or 1)",
      ylab = "Room Rent in rupees ",
      col = c("red", "blue", "green", "yellow")
    )
  }
))

The effect of Free Breakfast depends on the outliers.

Hypothesis

Articulating hypothesis and conducting t-test to determine their p value

Hypothesis

1.Average RoomRent in hotels having a swimming pool is more than in those which don’t have one.

# Two-sided Welch t-test comparing mean RoomRent between the two
# HasSwimmingPool groups (0 and 1).
t.test(RoomRent ~ HasSwimmingPool, data = hotel.df)
## 
##  Welch Two Sample t-test
## 
## data:  RoomRent by HasSwimmingPool
## t = -29.013, df = 5011.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5096.030 -4450.942
## sample estimates:
## mean in group 0 mean in group 1 
##        3775.566        8549.052

Since the p-value is less than 0.05, we can reject the null hypothesis that the means are equal.

2.Average RoomRent in hotels with a high star rating is higher than in hotels with a lower star rating.

# Hypothesis 2 is about how RoomRent moves with StarRating, so test the
# correlation between the two columns. A two-sample t-test on
# (RoomRent, StarRating) only compares the mean rent in rupees with the mean
# star rating, which says nothing about their relationship.
# alternative = "greater" matches the one-sided claim in the hypothesis.
cor.test(hotel.df$StarRating, hotel.df$RoomRent, alternative = "greater")
## (output not recorded here: re-knit the document to regenerate it)

With a positive correlation of 0.37 between StarRating and RoomRent over 13232 listings, the p-value is far below 0.05, so we can reject the null hypothesis that room rent does not increase with star rating.

3.Average RoomRent in hotels with more capacity is more than hotels with lesser capacity.

# Hypothesis 3 is about how RoomRent moves with HotelCapacity, so test the
# correlation between the two columns. A two-sample t-test on
# (RoomRent, HotelCapacity) only compares the mean rent in rupees with the
# mean capacity, which says nothing about their relationship.
# alternative = "greater" matches the one-sided claim in the hypothesis.
cor.test(hotel.df$HotelCapacity, hotel.df$RoomRent, alternative = "greater")
## (output not recorded here: re-knit the document to regenerate it)

With a positive correlation of 0.16 between HotelCapacity and RoomRent over 13232 listings, the p-value is far below 0.05, so we can reject the null hypothesis that room rent does not increase with hotel capacity.

4.Average RoomRent in hotels providing Free Breakfast is more than in those which don’t provide it.

# Two-sided Welch t-test comparing mean RoomRent between the two
# FreeBreakfast groups (0 and 1).
t.test(RoomRent ~ FreeBreakfast, data = hotel.df)
## 
##  Welch Two Sample t-test
## 
## data:  RoomRent by FreeBreakfast
## t = 0.98095, df = 6212.3, p-value = 0.3267
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -153.5017  460.9935
## sample estimates:
## mean in group 0 mean in group 1 
##        5573.790        5420.044

Since the p-value is more than 0.05, we fail to reject the null hypothesis that they are equal

5.Average RoomRent in metro city hotels is more than that in non-metro city hotels.

# Two-sided Welch t-test comparing mean RoomRent between the two
# IsMetroCity groups (0 and 1).
t.test(RoomRent ~ IsMetroCity, data = hotel.df)
## 
##  Welch Two Sample t-test
## 
## data:  RoomRent by IsMetroCity
## t = 10.721, df = 13224, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   888.0308 1285.4102
## sample estimates:
## mean in group 0 mean in group 1 
##        5782.794        4696.073

REGRESSION MODELS

Generating a multiple linear regression model for RoomRent

1

# Model 1: RoomRent regressed on the three shortlisted predictors.
fit1 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity,
  data = hotel.df
)
summary(fit1)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity, 
##     data = hotel.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10804  -2295   -946   1002 310110 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6896.154    340.549  -20.25   <2e-16 ***
## StarRating       3597.322    111.670   32.21   <2e-16 ***
## HasSwimmingPool  2528.885    157.894   16.02   <2e-16 ***
## HotelCapacity     -15.558      1.006  -15.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6710 on 13228 degrees of freedom
## Multiple R-squared:  0.1628, Adjusted R-squared:  0.1626 
## F-statistic: 857.5 on 3 and 13228 DF,  p-value: < 2.2e-16

2

# Model 2: model 1 plus the IsWeekend and IsTouristDestination indicators.
fit2 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsWeekend + IsTouristDestination,
  data = hotel.df
)
summary(fit2)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity + 
##     IsWeekend + IsTouristDestination, data = hotel.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11233  -2380   -722   1083 309657 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8396.675    359.825 -23.335   <2e-16 ***
## StarRating            3635.819    110.800  32.814   <2e-16 ***
## HasSwimmingPool       2285.132    157.488  14.510   <2e-16 ***
## HotelCapacity          -13.965      1.004 -13.915   <2e-16 ***
## IsWeekend               71.583    119.413   0.599    0.549    
## IsTouristDestination  1878.944    127.266  14.764   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6656 on 13226 degrees of freedom
## Multiple R-squared:  0.1764, Adjusted R-squared:  0.1761 
## F-statistic: 566.5 on 5 and 13226 DF,  p-value: < 2.2e-16

3

# Model 3: model 1 plus the Airport column.
fit3 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity + Airport,
  data = hotel.df
)
summary(fit3)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity + 
##     Airport, data = hotel.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10785  -2265   -876    982 310437 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -7288.048    341.691 -21.329   <2e-16 ***
## StarRating       3522.990    111.531  31.588   <2e-16 ***
## HasSwimmingPool  2708.400    158.397  17.099   <2e-16 ***
## HotelCapacity     -14.776      1.006 -14.695   <2e-16 ***
## Airport            25.344      2.590   9.786   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6687 on 13227 degrees of freedom
## Multiple R-squared:  0.1688, Adjusted R-squared:  0.1686 
## F-statistic: 671.7 on 4 and 13227 DF,  p-value: < 2.2e-16
# Model 4: the three shortlisted predictors plus IsTouristDestination,
# IsNewYearEve, FreeWifi and FreeBreakfast. The trailing `- 1` removes the
# intercept from the model.
# NOTE(review): for a model without an intercept, summary.lm() computes
# R-squared without centring the response, so the R-squared of this model is
# not directly comparable with fit1-fit3. Confirm that dropping the intercept
# is intended.
fit4 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsTouristDestination + IsNewYearEve + FreeWifi + FreeBreakfast - 1,
  data = hotel.df
)
summary(fit4)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity + 
##     IsTouristDestination + IsNewYearEve + FreeWifi + FreeBreakfast - 
##     1, data = hotel.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8993  -2447  -1104    564 311854 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## StarRating            1769.5026    69.5855  25.429  < 2e-16 ***
## HasSwimmingPool       3344.1545   152.0961  21.987  < 2e-16 ***
## HotelCapacity           -8.0571     0.9852  -8.178 3.15e-16 ***
## IsTouristDestination  1134.4975   124.6961   9.098  < 2e-16 ***
## IsNewYearEve           645.4803   177.9118   3.628 0.000287 ***
## FreeWifi             -2043.9090   202.1663 -10.110  < 2e-16 ***
## FreeBreakfast         -210.2330   124.7687  -1.685 0.092016 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6764 on 13225 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4537 
## F-statistic:  1571 on 7 and 13225 DF,  p-value: < 2.2e-16