# Load the hotel pricing data and normalise the Date column to "Mon DD YYYY".
#
# The data directory is kept in a variable and joined with file.path() instead
# of calling setwd(), so running this script does not change the working
# directory of the R session.
data.dir <- "C:/Users/Dell/Downloads/Sameer Mathur"
hotel.df <- read.csv(file.path(data.dir, "Cities42.csv"))

# View() opens a GUI data viewer; only do that in an interactive session so the
# script can also be run non-interactively.
if (interactive()) {
  View(hotel.df)
}

# Raw date spelling -> canonical spelling. The replacements are applied in the
# order listed. The two "-Jan-16" spellings are mapped to 2017, exactly as the
# original replacements did (presumably year typos in the raw file -- TODO
# confirm against the data source).
date.fixes <- c(
  "18-Dec-16"  = "Dec 18 2016",
  "21-Dec-16"  = "Dec 21 2016",
  "24-Dec-16"  = "Dec 24 2016",
  "25-Dec-16"  = "Dec 25 2016",
  "28-Dec-16"  = "Dec 28 2016",
  "31-Dec-16"  = "Dec 31 2016",
  "04-Jan-17"  = "Jan 04 2017",
  "04-Jan-16"  = "Jan 04 2017",
  "08-Jan-16"  = "Jan 08 2017",
  "08-Jan-17"  = "Jan 08 2017",
  "Jan 4 2017" = "Jan 04 2017",
  "Jan 8 2017" = "Jan 08 2017"
)
# fixed = TRUE: the patterns are literal strings, not regular expressions.
for (raw in names(date.fixes)) {
  hotel.df$Date <- gsub(raw, date.fixes[[raw]], hotel.df$Date, fixed = TRUE)
}

# Row count per normalised date.
table(hotel.df$Date)
##
## Dec 18 2016 Dec 21 2016 Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016
## 1652 1655 1655 1655 1655 1655
## Jan 04 2017 Jan 08 2017
## 1652 1653
# Store Date as a categorical variable and inspect its levels.
hotel.df[["Date"]] <- factor(x = hotel.df[["Date"]])
is.factor(hotel.df[["Date"]])
## [1] TRUE
levels(hotel.df[["Date"]])
## [1] "Dec 18 2016" "Dec 21 2016" "Dec 24 2016" "Dec 25 2016" "Dec 28 2016"
## [6] "Dec 31 2016" "Jan 04 2017" "Jan 08 2017"
# Per-column overview of the whole data set.
summary(object = hotel.df)
## CityName Population CityRank IsMetroCity
## Delhi :2048 Min. : 8096 Min. : 0.00 Min. :0.0000
## Jaipur : 768 1st Qu.: 744983 1st Qu.: 2.00 1st Qu.:0.0000
## Mumbai : 712 Median : 3046163 Median : 9.00 Median :0.0000
## Bangalore: 656 Mean : 4416837 Mean :14.83 Mean :0.2842
## Goa : 624 3rd Qu.: 8443675 3rd Qu.:24.00 3rd Qu.:1.0000
## Kochi : 608 Max. :12442373 Max. :44.00 Max. :1.0000
## (Other) :7816
## IsTouristDestination IsWeekend IsNewYearEve Date
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Dec 21 2016:1655
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 Dec 24 2016:1655
## Median :1.0000 Median :1.0000 Median :0.0000 Dec 25 2016:1655
## Mean :0.6972 Mean :0.6228 Mean :0.1244 Dec 28 2016:1655
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 Dec 31 2016:1655
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Jan 08 2017:1653
## (Other) :3304
## HotelName RoomRent StarRating
## Vivanta by Taj : 32 Min. : 299 Min. :0.000
## Goldfinch Hotel : 24 1st Qu.: 2436 1st Qu.:3.000
## OYO Rooms : 24 Median : 4000 Median :3.000
## The Gordon House Hotel: 24 Mean : 5474 Mean :3.459
## Apnayt Villa : 16 3rd Qu.: 6299 3rd Qu.:4.000
## Bentleys Hotel Colaba : 16 Max. :322500 Max. :5.000
## (Other) :13096
## Airport
## Min. : 0.20
## 1st Qu.: 8.40
## Median : 15.00
## Mean : 21.16
## 3rd Qu.: 24.00
## Max. :124.00
##
## HotelAddress
## The Mall, Shimla : 32
## #2-91/14/8, White Fields, Kondapur, Hitech City, Hyderabad, 500084 India: 16
## 121, City Terrace, Walchand Hirachand Marg, Mumbai, Maharashtra : 16
## 14-4507/9, Balmatta Road, Near Jyothi Circle, Hampankatta : 16
## 144/7, Rajiv Gandi Salai (OMR), Kottivakkam, Chennai, Tamil Nadu : 16
## 17, Oliver Road, Colaba, Mumbai, Maharashtra : 16
## (Other) :13120
## HotelPincode HotelDescription FreeWifi FreeBreakfast
## Min. : 100025 3 : 120 Min. :0.0000 Min. :0.0000
## 1st Qu.: 221001 Abc : 112 1st Qu.:1.0000 1st Qu.:0.0000
## Median : 395003 3-star hotel: 104 Median :1.0000 Median :1.0000
## Mean : 397430 3.5 : 88 Mean :0.9259 Mean :0.6491
## 3rd Qu.: 570001 4 : 72 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7000157 (Other) :12728 Max. :1.0000 Max. :1.0000
## NA's : 8
## HotelCapacity HasSwimmingPool
## Min. : 0.00 Min. :0.0000
## 1st Qu.: 16.00 1st Qu.:0.0000
## Median : 34.00 Median :0.0000
## Mean : 62.51 Mean :0.3558
## 3rd Qu.: 75.00 3rd Qu.:1.0000
## Max. :600.00 Max. :1.0000
##
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) for every
# column of hotel.df, using psych::describe().
library(psych)
describe(x = hotel.df)
## vars n mean sd median trimmed
## CityName* 1 13232 18.07 11.72 16 17.29
## Population 2 13232 4416836.87 4258386.00 3046163 4040816.22
## CityRank 3 13232 14.83 13.51 9 13.30
## IsMetroCity 4 13232 0.28 0.45 0 0.23
## IsTouristDestination 5 13232 0.70 0.46 1 0.75
## IsWeekend 6 13232 0.62 0.48 1 0.65
## IsNewYearEve 7 13232 0.12 0.33 0 0.03
## Date* 8 13232 4.50 2.29 4 4.50
## HotelName* 9 13232 841.19 488.16 827 841.18
## RoomRent 10 13232 5473.99 7333.12 4000 4383.33
## StarRating 11 13232 3.46 0.76 3 3.40
## Airport 12 13232 21.16 22.76 15 16.39
## HotelAddress* 13 13232 1202.53 582.17 1261 1233.25
## HotelPincode 14 13232 397430.26 259837.50 395003 388540.47
## HotelDescription* 15 13224 581.34 363.26 567 575.37
## FreeWifi 16 13232 0.93 0.26 1 1.00
## FreeBreakfast 17 13232 0.65 0.48 1 0.69
## HotelCapacity 18 13232 62.51 76.66 34 46.03
## HasSwimmingPool 19 13232 0.36 0.48 0 0.32
## mad min max range skew
## CityName* 11.86 1.0 42 41.0 0.48
## Population 3846498.95 8096.0 12442373 12434277.0 0.68
## CityRank 11.86 0.0 44 44.0 0.69
## IsMetroCity 0.00 0.0 1 1.0 0.96
## IsTouristDestination 0.00 0.0 1 1.0 -0.86
## IsWeekend 0.00 0.0 1 1.0 -0.51
## IsNewYearEve 0.00 0.0 1 1.0 2.28
## Date* 2.97 1.0 8 7.0 0.00
## HotelName* 641.97 1.0 1670 1669.0 0.01
## RoomRent 2653.85 299.0 322500 322201.0 16.75
## StarRating 0.74 0.0 5 5.0 0.48
## Airport 11.12 0.2 124 123.8 2.73
## HotelAddress* 668.65 1.0 2108 2107.0 -0.37
## HotelPincode 257975.37 100025.0 7000157 6900132.0 9.99
## HotelDescription* 472.95 1.0 1226 1225.0 0.11
## FreeWifi 0.00 0.0 1 1.0 -3.25
## FreeBreakfast 0.00 0.0 1 1.0 -0.62
## HotelCapacity 28.17 0.0 600 600.0 2.95
## HasSwimmingPool 0.00 0.0 1 1.0 0.60
## kurtosis se
## CityName* -0.88 0.10
## Population -1.08 37019.65
## CityRank -0.76 0.12
## IsMetroCity -1.08 0.00
## IsTouristDestination -1.26 0.00
## IsWeekend -1.74 0.00
## IsNewYearEve 3.18 0.00
## Date* -1.24 0.02
## HotelName* -1.25 4.24
## RoomRent 582.06 63.75
## StarRating 0.25 0.01
## Airport 7.89 0.20
## HotelAddress* -0.88 5.06
## HotelPincode 249.76 2258.86
## HotelDescription* -1.25 3.16
## FreeWifi 8.57 0.00
## FreeBreakfast -1.61 0.00
## HotelCapacity 11.39 0.67
## HasSwimmingPool -1.64 0.00
Dependent variable: room rent. We use a corrgram to find the independent variables that influence the dependent variable the most.
library(corrgram)
# Correlogram of hotel.df with the variables reordered (order = TRUE):
# shaded cells below the diagonal, pie glyphs above it.
corrgram(
  hotel.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of all the factors affecting Hotel Room Pricing"
)
From the above corrgram we can see that the three factors affecting the room rent the most are the star rating, the hotel capacity and whether the hotel has a swimming pool.
# Count rows without (0) and with (1) a swimming pool. The table is computed
# once, printed, and then reused for the bar plot.
Swim <- table(hotel.df$HasSwimmingPool)
Swim
##
## 0 1
## 8524 4708
barplot(Swim, main = "Barplot of Hotel Swimming Pool")
From the bar plot we can see that fewer hotels have a swimming pool (4708) than do not have one (8524).
# Count rows per star rating. The table is computed once, printed, and then
# reused for the bar plot.
starRating <- table(hotel.df$StarRating)
starRating
##
## 0 1 2 2.5 3 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1
## 16 8 440 632 5953 8 16 8 1752 8 24 16 32 2463 24
## 4.3 4.4 4.5 4.7 4.8 5
## 16 8 376 8 16 1408
barplot(starRating, main = "Barplot for Star Rating")
From the bar plot we can see that the largest number of hotels have a 3-star rating.
# Distribution of hotel capacity.
boxplot(
  hotel.df$HotelCapacity,
  main = "Boxplot for Hotel Capacity",
  horizontal = TRUE
)

# Scatter plots of room rent against star rating, hotel capacity and the
# swimming-pool indicator.
plot(
  hotel.df$StarRating, hotel.df$RoomRent,
  main = "RoomRent of Hotels with StarRating",
  ylab = "RoomRent in INR",
  xlab = "Star rating out of 5",
  cex = 1.0
)
plot(
  hotel.df$RoomRent, hotel.df$HotelCapacity,
  main = "RoomRent of Hotels with Hotel capacity",
  ylab = "Hotel Capacity in rooms",
  xlab = "RoomRent in INR",
  cex = 1.0
)
# Axis label spelling fixed (was "Has Swimmng Pool ").
plot(
  hotel.df$RoomRent, hotel.df$HasSwimmingPool,
  main = "RoomRent of Hotels with HasSwimmingPool",
  ylab = "Has Swimming Pool",
  xlab = "RoomRent",
  cex = 1.0
)
# Correlogram restricted to room rent and three predictors. The columns are
# selected by name so the panels are labelled "RoomRent", "StarRating", ...;
# building the frame with data.frame(hotel.df$RoomRent, ...) would label them
# "hotel.df.RoomRent", "hotel.df.StarRating", ... instead.
xy <- hotel.df[, c("RoomRent", "StarRating", "HasSwimmingPool", "HotelCapacity")]
corrgram(
  xy,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Correlation of each predictor (rows of the result) with room rent.
x <- hotel.df[, c("StarRating", "HotelCapacity", "HasSwimmingPool")]
y <- hotel.df[["RoomRent"]]
cor(x, y)
## [,1]
## StarRating 0.3693734
## HotelCapacity 0.1578733
## HasSwimmingPool 0.3116577
# Covariance of each predictor in x with room rent (y).
cov(x = x, y = y)
## [,1]
## StarRating 2048.375
## HotelCapacity 88753.413
## HasSwimmingPool 1094.202

# Copy of the data keeping only rows with a room rent below 100000; used for
# the second plot of each boxplot pair.
RoomRent1.df <- hotel.df[which(hotel.df[["RoomRent"]] < 100000), ]

# Row counts and mean room rent by IsMetroCity.
metro <- table(hotel.df[["IsMetroCity"]])
barplot(
  metro,
  main = "Distribution of IsMetroCity",
  xlab = "Not a Metro city(0) Is a Metro City(1)",
  col = "blue"
)
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["IsMetroCity"]]),
  FUN = mean
)
## Group.1 x
## 1 0 5782.794
## 2 1 4696.073
# Room rent by IsMetroCity, drawn horizontally: first on all rows, then on the
# subset with rent below 100000 (RoomRent1.df).
boxplot(
  RoomRent ~ IsMetroCity,
  data = hotel.df,
  main = "Room rent vs. IsMetroCity",
  ylab = "Not a Metro city(0) or Metro City(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)
boxplot(
  RoomRent ~ IsMetroCity,
  data = RoomRent1.df,
  main = "Room rent vs. IsMetroCity",
  ylab = "Not a Metro city(0) or Metro City(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)

# Row counts and mean room rent by IsTouristDestination.
tourist <- table(hotel.df[["IsTouristDestination"]])
barplot(
  tourist,
  main = "Distribution of IsTouristDestination",
  xlab = "Not a Tourist Destination(0) Is a Tourist Destination(1)",
  col = "blue"
)
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["IsTouristDestination"]]),
  FUN = mean
)
## Group.1 x
## 1 0 4111.003
## 2 1 6066.024
# Room rent by IsTouristDestination, drawn horizontally: first on all rows,
# then on the subset with rent below 100000 (RoomRent1.df).
boxplot(
  RoomRent ~ IsTouristDestination,
  data = hotel.df,
  main = "Room rent vs. IsTouristDestination",
  ylab = "Not a Tourist Destination(0) Is a Tourist Destination(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)
boxplot(
  RoomRent ~ IsTouristDestination,
  data = RoomRent1.df,
  main = "Room rent vs. IsTouristDestination",
  ylab = "Not a Tourist Destination(0) Is a Tourist Destination(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)
From the above boxplots we can see that room rents at tourist destinations are generally higher than at non-tourist destinations.
# Row counts and mean room rent by IsWeekend.
weekend <- table(hotel.df[["IsWeekend"]])
barplot(
  weekend,
  main = "Distribution of IsWeekend",
  xlab = "Not a Weekend(0) Is a Weekend(1)",
  col = "blue"
)
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["IsWeekend"]]),
  FUN = mean
)
## Group.1 x
## 1 0 5430.835
## 2 1 5500.129

# Room rent by IsWeekend, drawn horizontally: all rows, then the subset with
# rent below 100000 (RoomRent1.df).
boxplot(
  RoomRent ~ IsWeekend,
  data = hotel.df,
  main = "Room rent vs. IsWeekend",
  ylab = "Not a Weekend(0) Is a Weekend(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)
boxplot(
  RoomRent ~ IsWeekend,
  data = RoomRent1.df,
  main = "Room rent vs. IsWeekend",
  ylab = "Not a Weekend(0) Is a Weekend(1)",
  xlab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow"),
  horizontal = TRUE
)

# Row counts per date and mean room rent per date (kept in d and printed).
date <- table(hotel.df[["Date"]])
barplot(date, main = "Distribution of Dates", col = "blue")
d <- aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["Date"]]),
  FUN = mean
)
d
## Group.1 x
## 1 Dec 18 2016 4896.402
## 2 Dec 21 2016 5085.315
## 3 Dec 24 2016 5543.236
## 4 Dec 25 2016 5464.143
## 5 Dec 28 2016 5593.924
## 6 Dec 31 2016 6191.776
## 7 Jan 04 2017 5674.062
## 8 Jan 08 2017 5342.234
# Room rent by date: first on all rows, then on the subset with rent below
# 100000 (RoomRent1.df). These boxplots are vertical, so the groups (dates) are
# on the x axis and the rent is on the y axis; the axis labels say so (the rent
# label used to be attached to the x axis).
boxplot(
  RoomRent ~ Date,
  data = hotel.df,
  main = "Room rent vs. Dates",
  xlab = "Date",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
boxplot(
  RoomRent ~ Date,
  data = RoomRent1.df,
  main = "Room rent vs. Dates",
  xlab = "Date",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
From the above boxplots we can see that the room rents are the highest on 31st December.
# Row counts and mean room rent for each distinct value of Airport.
airport <- table(hotel.df[["Airport"]])
barplot(
  airport,
  main = "Distribution of Distance from Airport",
  col = "blue"
)
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["Airport"]]),
  FUN = mean
)
## Group.1 x
## 1 0.2 5247.000
## 2 0.3 2217.500
## 3 0.4 4987.025
## 4 0.5 2437.458
## 5 0.6 4805.781
## 6 0.7 2894.750
## 7 0.8 4687.500
## 8 0.9 4356.128
## 9 1.0 4452.812
## 10 1.1 43487.500
## 11 1.2 5154.050
## 12 1.4 15562.500
## 13 1.5 7273.250
## 14 1.6 10460.469
## 15 1.7 4968.091
## 16 1.8 3298.681
## 17 1.9 6440.400
## 18 2.0 5804.821
## 19 2.1 3400.975
## 20 2.2 4448.000
## 21 2.3 8030.938
## 22 2.4 5419.656
## 23 2.5 5863.000
## 24 2.6 3027.396
## 25 2.7 4659.054
## 26 2.8 4563.417
## 27 2.9 3667.232
## 28 3.0 4636.929
## 29 3.1 5859.875
## 30 3.2 7848.667
## 31 3.3 8960.125
## 32 3.4 6088.854
## 33 3.5 4267.839
## 34 3.6 6699.828
## 35 3.7 2603.125
## 36 3.8 3145.825
## 37 3.9 4075.250
## 38 4.0 4013.944
## 39 4.1 4218.688
## 40 4.2 3857.075
## 41 4.3 4388.656
## 42 4.4 5525.062
## 43 4.5 6135.208
## 44 4.6 5061.025
## 45 4.7 2378.125
## 46 4.8 3889.250
## 47 4.9 3421.219
## 48 5.0 6144.041
## 49 5.1 6547.000
## 50 5.2 4390.306
## 51 5.3 4119.250
## 52 5.4 4705.900
## 53 5.5 4371.625
## 54 5.6 5430.900
## 55 5.7 6046.594
## 56 5.8 5625.607
## 57 5.9 5016.650
## 58 6.0 3875.970
## 59 6.1 5853.375
## 60 6.2 2701.828
## 61 6.3 1705.125
## 62 6.4 3212.479
## 63 6.5 4871.208
## 64 6.6 3623.625
## 65 6.7 2994.292
## 66 6.8 4132.357
## 67 6.9 2749.950
## 68 7.0 4028.469
## 69 7.1 2366.806
## 70 7.2 3611.604
## 71 7.3 2522.042
## 72 7.4 3571.975
## 73 7.5 4816.667
## 74 7.6 4420.141
## 75 7.7 27828.708
## 76 7.8 5656.594
## 77 7.9 2730.958
## 78 8.0 9879.685
## 79 8.1 5120.639
## 80 8.2 3684.607
## 81 8.3 5117.825
## 82 8.4 4164.208
## 83 8.5 2991.688
## 84 8.6 1378.938
## 85 8.7 4457.446
## 86 8.8 3498.562
## 87 8.9 2627.250
## 88 9.0 3618.694
## 89 9.1 5122.458
## 90 9.2 9520.790
## 91 9.3 4147.375
## 92 9.4 5233.500
## 93 9.5 8224.909
## 94 9.6 3226.050
## 95 9.7 4376.083
## 96 9.8 4045.625
## 97 9.9 7947.732
## 98 10.0 5184.302
## 99 10.2 2780.000
## 100 10.3 2587.000
## 101 10.4 2346.000
## 102 10.6 1574.375
## 103 10.7 7025.000
## 104 10.8 12157.875
## 105 10.9 1949.812
## 106 11.0 4665.262
## 107 11.1 2867.250
## 108 11.3 1948.812
## 109 11.7 4069.000
## 110 11.9 7264.938
## 111 12.0 5014.164
## 112 12.2 3113.458
## 113 12.3 1746.750
## 114 12.6 4241.000
## 115 12.7 4566.750
## 116 13.0 6872.332
## 117 13.1 2525.000
## 118 13.3 4881.250
## 119 13.5 1831.250
## 120 13.6 4371.333
## 121 13.7 5908.812
## 122 13.8 2507.500
## 123 14.0 3632.243
## 124 14.2 1801.000
## 125 14.4 4002.167
## 126 14.5 3847.500
## 127 14.6 6711.438
## 128 14.7 5431.167
## 129 14.8 7086.625
## 130 14.9 4631.250
## 131 15.0 4804.245
## 132 15.3 2983.875
## 133 15.4 5179.125
## 134 15.6 4233.375
## 135 15.7 3385.250
## 136 15.8 5960.500
## 137 15.9 9961.875
## 138 16.0 5052.724
## 139 16.1 10451.000
## 140 16.2 4637.250
## 141 16.4 2404.250
## 142 16.5 4639.250
## 143 16.7 6648.281
## 144 17.0 5245.613
## 145 17.1 3251.000
## 146 17.2 4874.500
## 147 17.4 1911.750
## 148 17.5 16538.125
## 149 17.6 6273.000
## 150 17.8 4139.438
## 151 18.0 5023.542
## 152 18.3 6125.000
## 153 18.5 3543.250
## 154 18.6 6693.750
## 155 18.7 2782.625
## 156 19.0 10216.920
## 157 19.5 2262.500
## 158 19.9 7232.500
## 159 20.0 5474.096
## 160 20.2 8412.500
## 161 20.3 3930.812
## 162 20.5 2169.625
## 163 20.9 6281.750
## 164 21.0 4546.419
## 165 21.4 6944.500
## 166 21.5 3882.750
## 167 22.0 4453.590
## 168 22.1 5305.000
## 169 22.2 3235.000
## 170 22.4 3887.500
## 171 22.5 6103.250
## 172 23.0 5019.740
## 173 23.2 10887.500
## 174 23.3 5088.000
## 175 23.4 4942.375
## 176 24.0 3863.335
## 177 24.2 38115.625
## 178 24.3 16894.500
## 179 24.5 5305.750
## 180 24.6 45274.375
## 181 24.7 2078.000
## 182 24.9 20867.438
## 183 25.0 5229.457
## 184 25.6 7140.625
## 185 25.7 6137.500
## 186 25.9 15937.500
## 187 26.0 6258.703
## 188 26.1 26156.250
## 189 26.3 2369.250
## 190 26.4 7483.000
## 191 26.5 6112.500
## 192 26.7 7992.500
## 193 27.0 5835.206
## 194 27.1 23437.500
## 195 27.2 4832.000
## 196 28.0 3282.277
## 197 28.1 7140.625
## 198 28.6 7518.750
## 199 28.7 3781.625
## 200 29.0 3602.364
## 201 30.0 5784.393
## 202 30.5 20500.000
## 203 31.0 4943.406
## 204 31.2 6193.750
## 205 31.3 9125.000
## 206 31.9 4204.750
## 207 32.0 5803.528
## 208 32.9 7936.875
## 209 33.0 3026.100
## 210 33.4 6292.000
## 211 34.0 5784.875
## 212 35.0 8111.898
## 213 36.0 7528.882
## 214 36.2 6871.500
## 215 37.0 8712.878
## 216 38.0 6006.755
## 217 38.3 8117.875
## 218 39.0 4524.650
## 219 39.9 2206.500
## 220 40.0 5576.768
## 221 41.0 5355.676
## 222 42.0 3292.293
## 223 42.7 4118.750
## 224 43.0 7559.758
## 225 43.9 9247.500
## 226 44.0 5925.000
## 227 44.5 4233.125
## 228 44.6 7147.000
## 229 44.8 33033.500
## 230 46.0 4236.850
## 231 47.0 7256.000
## 232 47.5 19108.125
## 233 48.0 4268.750
## 234 48.4 3000.000
## 235 49.0 18237.500
## 236 50.0 5681.875
## 237 50.1 2360.875
## 238 50.5 3417.750
## 239 51.0 3178.250
## 240 52.0 4198.375
## 241 52.7 7820.000
## 242 53.0 4062.500
## 243 55.0 18950.000
## 244 57.2 15375.000
## 245 60.0 2846.000
## 246 61.0 14319.062
## 247 62.0 5412.719
## 248 63.0 8687.500
## 249 63.5 3900.000
## 250 63.6 2625.000
## 251 65.0 6257.888
## 252 67.6 4149.750
## 253 69.0 2682.125
## 254 73.1 3172.500
## 255 80.0 2554.000
## 256 80.3 1117.750
## 257 81.0 2554.000
## 258 82.0 6717.111
## 259 83.0 2554.000
## 260 84.0 2554.000
## 261 85.0 2554.000
## 262 86.0 2554.000
## 263 87.0 2554.000
## 264 91.3 1758.875
## 265 96.5 3821.375
## 266 100.0 6144.257
## 267 102.4 6444.750
## 268 105.0 8162.371
## 269 110.0 5976.109
## 270 117.4 6337.375
## 271 124.0 4629.648
# Room rent by distance from the airport: first on all rows, then on the
# subset with rent below 100000 (RoomRent1.df). These boxplots are vertical, so
# the groups (distances) are on the x axis and the rent is on the y axis; the
# axis labels say so (the rent label used to be attached to the x axis).
boxplot(
  RoomRent ~ Airport,
  data = hotel.df,
  main = "Room rent vs. Distance from Airport",
  xlab = "Distance from Airport",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
boxplot(
  RoomRent ~ Airport,
  data = RoomRent1.df,
  main = "Room rent vs. Distance from Airport",
  xlab = "Distance from Airport",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
# Row counts and mean room rent by FreeWifi.
wifi <- table(hotel.df[["FreeWifi"]])
barplot(wifi, main = "Effect of Free Wifi", col = "blue")
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["FreeWifi"]]),
  FUN = mean
)
## Group.1 x
## 1 0 5380.004
## 2 1 5481.518
# Room rent by FreeWifi: first on all rows, then on the subset with rent below
# 100000 (RoomRent1.df). These boxplots are vertical, so the groups are on the
# x axis and the rent is on the y axis; the axis labels say so (the rent label
# used to be attached to the x axis).
boxplot(
  RoomRent ~ FreeWifi,
  data = hotel.df,
  main = "Room rent vs. Free Wifi",
  xlab = "FreeWifi (0 = no, 1 = yes)",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
boxplot(
  RoomRent ~ FreeWifi,
  data = RoomRent1.df,
  main = "Room rent vs. Free Wifi",
  xlab = "FreeWifi (0 = no, 1 = yes)",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
From the above box plot we can see that rooms having free wifi have a higher rent than those which don’t have free Wifi.
# Row counts and mean room rent by FreeBreakfast.
breakfast <- table(hotel.df[["FreeBreakfast"]])
barplot(breakfast, main = "Effect of FreeBreakfast", col = "blue")
aggregate(
  x = hotel.df[["RoomRent"]],
  by = list(hotel.df[["FreeBreakfast"]]),
  FUN = mean
)
## Group.1 x
## 1 0 5573.790
## 2 1 5420.044
# Room rent by FreeBreakfast: first on all rows, then on the subset with rent
# below 100000 (RoomRent1.df). These boxplots are vertical, so the groups are
# on the x axis and the rent is on the y axis; the axis labels say so (the rent
# label used to be attached to the x axis).
boxplot(
  RoomRent ~ FreeBreakfast,
  data = hotel.df,
  main = "Room rent vs. Free Breakfast",
  xlab = "FreeBreakfast (0 = no, 1 = yes)",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
boxplot(
  RoomRent ~ FreeBreakfast,
  data = RoomRent1.df,
  main = "Room rent vs. Free Breakfast",
  xlab = "FreeBreakfast (0 = no, 1 = yes)",
  ylab = "Room Rent in rupees",
  col = c("red", "blue", "green", "yellow")
)
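The effect of free breakfast is unclear: the mean rent is slightly lower for hotels offering free breakfast (5420 vs. 5574), and the comparison is sensitive to the outliers.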
# Welch two-sample t-test: is the mean RoomRent different between rows with
# HasSwimmingPool = 0 and rows with HasSwimmingPool = 1?
t.test(RoomRent~HasSwimmingPool,data = hotel.df)
##
## Welch Two Sample t-test
##
## data: RoomRent by HasSwimmingPool
## t = -29.013, df = 5011.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5096.030 -4450.942
## sample estimates:
## mean in group 0 mean in group 1
## 3775.566 8549.052
Since the p-value is less than 0.05, we can reject the null hypothesis that the mean room rents of hotels with and without a swimming pool are equal.
# Welch two-sample t-test between the RoomRent column and the StarRating column.
# NOTE(review): this compares the mean rent with the mean star rating, two
# quantities on different scales, so it does not test whether StarRating is
# related to RoomRent. cor.test() or a regression looks like the intended
# check -- confirm before relying on this result.
t.test(hotel.df$RoomRent,hotel.df$StarRating)
##
## Welch Two Sample t-test
##
## data: hotel.df$RoomRent and hotel.df$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5345.575 5595.491
## sample estimates:
## mean of x mean of y
## 5473.991838 3.458933
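Since the p-value is less than 0.05, we can reject the null hypothesis that the mean room rent and the mean star rating are equal. Note that these two variables are measured on different scales, so this test does not by itself show that star rating influences room rent.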
# Welch two-sample t-test between the RoomRent column and the HotelCapacity
# column.
# NOTE(review): this compares the mean rent with the mean capacity, two
# quantities on different scales, so it does not test whether HotelCapacity is
# related to RoomRent. cor.test() or a regression looks like the intended
# check -- confirm before relying on this result.
t.test(hotel.df$RoomRent,hotel.df$HotelCapacity)
##
## Welch Two Sample t-test
##
## data: hotel.df$RoomRent and hotel.df$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5286.515 5536.445
## sample estimates:
## mean of x mean of y
## 5473.99184 62.51164
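Since the p-value is less than 0.05, we can reject the null hypothesis that the mean room rent and the mean hotel capacity are equal. Note that these two variables are measured on different scales, so this test does not by itself show that hotel capacity influences room rent.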
# Welch two-sample t-test: is the mean RoomRent different between rows with
# FreeBreakfast = 0 and rows with FreeBreakfast = 1?
t.test(RoomRent~FreeBreakfast, data = hotel.df)
##
## Welch Two Sample t-test
##
## data: RoomRent by FreeBreakfast
## t = 0.98095, df = 6212.3, p-value = 0.3267
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -153.5017 460.9935
## sample estimates:
## mean in group 0 mean in group 1
## 5573.790 5420.044
Since the p-value is more than 0.05, we fail to reject the null hypothesis that the mean room rents of hotels with and without free breakfast are equal.
# Welch two-sample t-test: is the mean RoomRent different between rows with
# IsMetroCity = 0 and rows with IsMetroCity = 1?
t.test(RoomRent~IsMetroCity, data = hotel.df)
##
## Welch Two Sample t-test
##
## data: RoomRent by IsMetroCity
## t = 10.721, df = 13224, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 888.0308 1285.4102
## sample estimates:
## mean in group 0 mean in group 1
## 5782.794 4696.073
# Model 1: room rent regressed on star rating, swimming-pool indicator and
# hotel capacity.
fit1 <- lm(
  formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity,
  data = hotel.df
)
summary(fit1)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity,
## data = hotel.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10804 -2295 -946 1002 310110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6896.154 340.549 -20.25 <2e-16 ***
## StarRating 3597.322 111.670 32.21 <2e-16 ***
## HasSwimmingPool 2528.885 157.894 16.02 <2e-16 ***
## HotelCapacity -15.558 1.006 -15.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6710 on 13228 degrees of freedom
## Multiple R-squared: 0.1628, Adjusted R-squared: 0.1626
## F-statistic: 857.5 on 3 and 13228 DF, p-value: < 2.2e-16
# Model 2: model 1 plus the IsWeekend and IsTouristDestination indicators.
fit2 <- lm(
  formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsWeekend + IsTouristDestination,
  data = hotel.df
)
summary(fit2)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
## IsWeekend + IsTouristDestination, data = hotel.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11233 -2380 -722 1083 309657
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8396.675 359.825 -23.335 <2e-16 ***
## StarRating 3635.819 110.800 32.814 <2e-16 ***
## HasSwimmingPool 2285.132 157.488 14.510 <2e-16 ***
## HotelCapacity -13.965 1.004 -13.915 <2e-16 ***
## IsWeekend 71.583 119.413 0.599 0.549
## IsTouristDestination 1878.944 127.266 14.764 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6656 on 13226 degrees of freedom
## Multiple R-squared: 0.1764, Adjusted R-squared: 0.1761
## F-statistic: 566.5 on 5 and 13226 DF, p-value: < 2.2e-16
# Model 3: model 1 plus the Airport column.
fit3 <- lm(
  formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    Airport,
  data = hotel.df
)
summary(fit3)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
## Airport, data = hotel.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10785 -2265 -876 982 310437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7288.048 341.691 -21.329 <2e-16 ***
## StarRating 3522.990 111.531 31.588 <2e-16 ***
## HasSwimmingPool 2708.400 158.397 17.099 <2e-16 ***
## HotelCapacity -14.776 1.006 -14.695 <2e-16 ***
## Airport 25.344 2.590 9.786 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6687 on 13227 degrees of freedom
## Multiple R-squared: 0.1688, Adjusted R-squared: 0.1686
## F-statistic: 671.7 on 4 and 13227 DF, p-value: < 2.2e-16
# Model 4: star rating, swimming pool, capacity, tourist destination,
# New Year's Eve, free wifi and free breakfast, fitted WITHOUT an intercept
# (the trailing "- 1" in the formula).
# NOTE(review): summary.lm computes R-squared differently for models without
# an intercept, so this model's R-squared is not comparable with fit1-fit3.
# Confirm that dropping the intercept is intended.
fit4 <- lm(
  formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsTouristDestination + IsNewYearEve + FreeWifi + FreeBreakfast -
    1,
  data = hotel.df
)
summary(fit4)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
## IsTouristDestination + IsNewYearEve + FreeWifi + FreeBreakfast -
## 1, data = hotel.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8993 -2447 -1104 564 311854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## StarRating 1769.5026 69.5855 25.429 < 2e-16 ***
## HasSwimmingPool 3344.1545 152.0961 21.987 < 2e-16 ***
## HotelCapacity -8.0571 0.9852 -8.178 3.15e-16 ***
## IsTouristDestination 1134.4975 124.6961 9.098 < 2e-16 ***
## IsNewYearEve 645.4803 177.9118 3.628 0.000287 ***
## FreeWifi -2043.9090 202.1663 -10.110 < 2e-16 ***
## FreeBreakfast -210.2330 124.7687 -1.685 0.092016 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6764 on 13225 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4537
## F-statistic: 1571 on 7 and 13225 DF, p-value: < 2.2e-16