# Summarizing the variables of the data set
# psych supplies describe(), which is called further down in this file.
library(psych)
# Per-column summary of the city data frame (the echoed output follows as ## lines).
summary(city)
## X CityName Population CityRank
## Min. : 1 Delhi :2048 Min. : 8096 Min. : 0.00
## 1st Qu.: 3309 Jaipur : 768 1st Qu.: 744983 1st Qu.: 2.00
## Median : 6616 Mumbai : 712 Median : 3046163 Median : 9.00
## Mean : 6616 Bangalore: 656 Mean : 4416837 Mean :14.83
## 3rd Qu.: 9924 Goa : 624 3rd Qu.: 8443675 3rd Qu.:24.00
## Max. :13232 Kochi : 608 Max. :12442373 Max. :44.00
## (Other) :7816
## IsMetroCity IsTouristDestination IsWeekend IsNewYearEve
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.2842 Mean :0.6972 Mean :0.6228 Mean :0.1244
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Date HotelName RoomRent
## Dec 21 2016:1611 Vivanta by Taj : 32 Min. : 299
## Dec 24 2016:1611 Goldfinch Hotel : 24 1st Qu.: 2436
## Dec 25 2016:1611 OYO Rooms : 24 Median : 4000
## Dec 28 2016:1611 The Gordon House Hotel: 24 Mean : 5474
## Dec 31 2016:1611 Apnayt Villa : 16 3rd Qu.: 6299
## Dec 18 2016:1608 Bentleys Hotel Colaba : 16 Max. :322500
## (Other) :3569 (Other) :13096
## StarRating Airport
## Min. :0.000 Min. : 0.20
## 1st Qu.:3.000 1st Qu.: 8.40
## Median :3.000 Median : 15.00
## Mean :3.459 Mean : 21.16
## 3rd Qu.:4.000 3rd Qu.: 24.00
## Max. :5.000 Max. :124.00
##
## HotelAddress
## The Mall, Shimla : 32
## #2-91/14/8, White Fields, Kondapur, Hitech City, Hyderabad, 500084 India: 16
## 121, City Terrace, Walchand Hirachand Marg, Mumbai, Maharashtra : 16
## 14-4507/9, Balmatta Road, Near Jyothi Circle, Hampankatta : 16
## 144/7, Rajiv Gandi Salai (OMR), Kottivakkam, Chennai, Tamil Nadu : 16
## 17, Oliver Road, Colaba, Mumbai, Maharashtra : 16
## (Other) :13120
## HotelPincode HotelDescription FreeWifi FreeBreakfast
## Min. : 100025 3 : 120 Min. :0.0000 Min. :0.0000
## 1st Qu.: 221001 Abc : 112 1st Qu.:1.0000 1st Qu.:0.0000
## Median : 395003 3-star hotel: 104 Median :1.0000 Median :1.0000
## Mean : 397430 3.5 : 88 Mean :0.9259 Mean :0.6491
## 3rd Qu.: 570001 4 : 72 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7000157 (Other) :12728 Max. :1.0000 Max. :1.0000
## NA's : 8
## HotelCapacity HasSwimmingPool
## Min. : 0.00 Min. :0.0000
## 1st Qu.: 16.00 1st Qu.:0.0000
## Median : 34.00 Median :0.0000
## Mean : 62.51 Mean :0.3558
## 3rd Qu.: 75.00 3rd Qu.:1.0000
## Max. :600.00 Max. :1.0000
##
# Descriptive statistics (n, mean, sd, median, skew, ...) for every column of city.
# Rows marked with * in the output are the non-numeric columns.
describe(city)
## vars n mean sd median trimmed
## X 1 13232 6616.50 3819.89 6616.5 6616.50
## CityName* 2 13232 18.07 11.72 16.0 17.29
## Population 3 13232 4416836.87 4258386.00 3046163.0 4040816.22
## CityRank 4 13232 14.83 13.51 9.0 13.30
## IsMetroCity 5 13232 0.28 0.45 0.0 0.23
## IsTouristDestination 6 13232 0.70 0.46 1.0 0.75
## IsWeekend 7 13232 0.62 0.48 1.0 0.65
## IsNewYearEve 8 13232 0.12 0.33 0.0 0.03
## Date* 9 13232 14.26 2.82 14.0 14.39
## HotelName* 10 13232 841.84 488.14 834.0 842.05
## RoomRent 11 13232 5473.99 7333.12 4000.0 4383.33
## StarRating 12 13232 3.46 0.76 3.0 3.40
## Airport 13 13232 21.16 22.76 15.0 16.39
## HotelAddress* 14 13232 1202.69 581.98 1261.0 1233.43
## HotelPincode 15 13232 397430.26 259837.50 395003.0 388540.47
## HotelDescription* 16 13224 581.40 363.01 570.0 575.79
## FreeWifi 17 13232 0.93 0.26 1.0 1.00
## FreeBreakfast 18 13232 0.65 0.48 1.0 0.69
## HotelCapacity 19 13232 62.51 76.66 34.0 46.03
## HasSwimmingPool 20 13232 0.36 0.48 0.0 0.32
## mad min max range skew
## X 4904.44 1.0 13232 13231.0 0.00
## CityName* 11.86 1.0 42 41.0 0.48
## Population 3846498.95 8096.0 12442373 12434277.0 0.68
## CityRank 11.86 0.0 44 44.0 0.69
## IsMetroCity 0.00 0.0 1 1.0 0.96
## IsTouristDestination 0.00 0.0 1 1.0 -0.86
## IsWeekend 0.00 0.0 1 1.0 -0.51
## IsNewYearEve 0.00 0.0 1 1.0 2.28
## Date* 2.97 1.0 20 19.0 -1.05
## HotelName* 644.93 1.0 1670 1669.0 0.00
## RoomRent 2653.85 299.0 322500 322201.0 16.75
## StarRating 0.74 0.0 5 5.0 0.48
## Airport 11.12 0.2 124 123.8 2.73
## HotelAddress* 668.65 1.0 2108 2107.0 -0.37
## HotelPincode 257975.37 100025.0 7000157 6900132.0 9.99
## HotelDescription* 465.54 1.0 1226 1225.0 0.10
## FreeWifi 0.00 0.0 1 1.0 -3.25
## FreeBreakfast 0.00 0.0 1 1.0 -0.62
## HotelCapacity 28.17 0.0 600 600.0 2.95
## HasSwimmingPool 0.00 0.0 1 1.0 0.60
## kurtosis se
## X -1.20 33.21
## CityName* -0.88 0.10
## Population -1.08 37019.65
## CityRank -0.76 0.12
## IsMetroCity -1.08 0.00
## IsTouristDestination -1.26 0.00
## IsWeekend -1.74 0.00
## IsNewYearEve 3.18 0.00
## Date* 2.93 0.02
## HotelName* -1.26 4.24
## RoomRent 582.06 63.75
## StarRating 0.25 0.01
## Airport 7.89 0.20
## HotelAddress* -0.88 5.06
## HotelPincode 249.76 2258.86
## HotelDescription* -1.25 3.16
## FreeWifi 8.57 0.00
## FreeBreakfast -1.61 0.00
## HotelCapacity 11.39 0.67
## HasSwimmingPool -1.64 0.00
# Horizontal box plot of the HotelCapacity column
boxplot(
  x = city$HotelCapacity,
  horizontal = TRUE,
  main = "Boxplot for Hotel Capacity"
)

#Scatterplot pair wise for predictor variable
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# StarRating (x axis) against RoomRent (y axis)
scatterplot(
  x = city$StarRating,
  y = city$RoomRent,
  main = "RoomRent of Hotels with StarRating",
  xlab = "Star rating out of 5",
  ylab = "RoomRent in INR",
  cex = 1.1
)

# RoomRent (x axis) against HotelCapacity (y axis)
scatterplot(
  x = city$RoomRent,
  y = city$HotelCapacity,
  main = "RoomRent of Hotels with Hotel capacity",
  xlab = "RoomRent in INR",
  ylab = "Hotel Capacity in rooms",
  cex = 1.1
)

# RoomRent (x axis) against HasSwimmingPool (y axis); both are jittered so
# overlapping points stay visible.
plot(
  x = jitter(city$RoomRent),
  y = jitter(city$HasSwimmingPool),
  main = "RoomRent of Hotels with HasSwimmingPool",
  xlab = "RoomRent",
  ylab = "Has Swimmng Pool ",
  cex = 1.1
)

# Box-and-whisker plot of RoomRent for each HasSwimmingPool value
library(lattice)
bwplot(
  HasSwimmingPool ~ RoomRent,
  data = city,
  main = "RoomRent of Hotels with HasSwimmingPool",
  xlab = "RoomRent",
  ylab = "Has Swimmng Pool "
)

# Scatterplot matrix of RoomRent and the three candidate predictors,
# with histograms on the diagonal
scatterplotMatrix(
  city[, c("RoomRent", "HasSwimmingPool", "StarRating", "HotelCapacity")],
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter Plot Matrix",
  diagonal = "histogram"
)
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

# Corrgram of the response (RoomRent) and the three predictors
library(corrgram)
xyz <- data.frame(
  city$RoomRent,
  city$HasSwimmingPool,
  city$HotelCapacity,
  city$StarRating
)
corrgram(
  xyz,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Hotel Prices In India"
)

# Corrgram over the whole city data frame
library(corrgram)
corrgram(
  city,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Hotel data"
)

## The corrgram shows that HasSwimmingPool, StarRating and HotelCapacity are well correlated with RoomRent,
## so we take them as predictors.
# Comparing RoomRent on different dates: count the rows for each Date label.
# NOTE(review): the tabulation below lists 20 distinct labels because what
# appears to be the same calendar day is spelled in several ways (e.g.
# "18-Dec-16" and "Dec 18 2016", "Jan 04 2017" and "Jan 4 2017"). Anything
# grouped by Date treats these as separate groups; consider normalising the
# Date column where the data is loaded.
table(city$Date)
##
## 18-Dec-16 21-Dec-16 24-Dec-16 25-Dec-16 28-Dec-16 31-Dec-16
## 44 44 44 44 44 44
## 4-Jan-16 4-Jan-17 8-Jan-16 8-Jan-17 Dec 18 2016 Dec 21 2016
## 31 13 31 13 1608 1611
## Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016 Jan 04 2017 Jan 08 2017
## 1611 1611 1611 1611 1548 1542
## Jan 4 2017 Jan 8 2017
## 60 67
# Histogram of how the rows are distributed over the Date labels
library(lattice)
histogram(
  ~ Date,
  data = city,
  main = "Distribution of Dates",
  xlab = "Differnt of Dates",
  col = "Blue"
)

# Effect of the date on RoomRent: mean RoomRent for each Date label
d <- aggregate(RoomRent ~ Date, data = city, FUN = mean)
d
## Date RoomRent
## 1 18-Dec-16 3366.795
## 2 21-Dec-16 3437.545
## 3 24-Dec-16 3510.795
## 4 25-Dec-16 3349.591
## 5 28-Dec-16 3450.045
## 6 31-Dec-16 3570.318
## 7 4-Jan-16 4738.548
## 8 4-Jan-17 3829.615
## 9 8-Jan-16 4907.419
## 10 8-Jan-17 3843.077
## 11 Dec 18 2016 4938.257
## 12 Dec 21 2016 5130.320
## 13 Dec 24 2016 5598.746
## 14 Dec 25 2016 5521.896
## 15 Dec 28 2016 5652.478
## 16 Dec 31 2016 6263.374
## 17 Jan 04 2017 5754.513
## 18 Jan 08 2017 5406.821
## 19 Jan 4 2017 4481.400
## 20 Jan 8 2017 4347.821
# Mean RoomRent (from d) plotted against its Date label
scatterplot(
  x = d$Date,
  y = d$RoomRent,
  main = "Scatterplot between Date and RoomRent",
  xlab = "Date",
  ylab = "Room Rent in Rupees"
)

# Distribution of RoomRent for each Date label; the four colours are recycled
# across the boxes.
boxplot(
  RoomRent ~ Date,
  data = city,
  main = "Room rent vs. Date",
  xlab = "Different Dates",
  ylab = "Room Rent in rupees ",
  col = c("red", "blue", "green", "yellow")
)

# Analyzing the effect of IsWeekend on RoomRent: count the rows for each
# IsWeekend value (0 or 1).
table(city$IsWeekend)
##
## 0 1
## 4991 8241
# Bar chart of the IsWeekend counts
table1 <- table(city$IsWeekend)
barplot(
  table1,
  main = "Distribution of Weekend",
  xlab = "Not weekend(0) Weekend(1)",
  col = "orange"
)

# Comparing RoomRent on different dates: count the rows for each Date label.
# NOTE(review): this repeats an identical tabulation that appears earlier in
# this file.
table(city$Date)
##
## 18-Dec-16 21-Dec-16 24-Dec-16 25-Dec-16 28-Dec-16 31-Dec-16
## 44 44 44 44 44 44
## 4-Jan-16 4-Jan-17 8-Jan-16 8-Jan-17 Dec 18 2016 Dec 21 2016
## 31 13 31 13 1608 1611
## Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016 Jan 04 2017 Jan 08 2017
## 1611 1611 1611 1611 1548 1542
## Jan 4 2017 Jan 8 2017
## 60 67
# Histogram of how the rows are distributed over the Date labels
library(lattice)
histogram(
  ~ Date,
  data = city,
  main = "Distribution of Dates",
  xlab = "Differnt of Dates",
  col = "Blue"
)

# Analyzing how the hotel's distance from the airport affects RoomRent:
# start with the five-number summary and mean of the Airport column.
summary(city$Airport)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.20 8.40 15.00 21.16 24.00 124.00
# Horizontal box plot of the Airport column
boxplot(
  x = city$Airport,
  horizontal = TRUE,
  main = "Boxplot of Airport",
  xlab = "Distance of airport from hotel(Km)",
  col = "green"
)

# Hypothesis 1: hotels with a swimming pool have a higher average RoomRent
# than hotels without one. Group 0 is compared against group 1, so the
# one-sided alternative is "less" (mean of group 0 minus mean of group 1 < 0).
t.test(
  RoomRent ~ HasSwimmingPool,
  data = city,
  alternative = "less"
)
##
## Welch Two Sample t-test
##
## data: RoomRent by HasSwimmingPool
## t = -29.013, df = 5011.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -4502.814
## sample estimates:
## mean in group 0 mean in group 1
## 3775.566 8549.052
# Hypothesis 2: hotels with a high star rating have a higher average RoomRent
# than hotels with a low star rating.
# The hotels are split into a "low" and a "high" group at the median
# StarRating, and the mean RoomRent of the two groups is compared. Passing
# RoomRent and StarRating as the two samples would instead compare the mean
# rent with the mean rating, which says nothing about this hypothesis.
star_group <- factor(
  city$StarRating > median(city$StarRating),
  levels = c(FALSE, TRUE),
  labels = c("low", "high")
)
# Groups are ordered low, high, so "less" tests mean(low) - mean(high) < 0.
t.test(city$RoomRent ~ star_group, alternative = "less")
## (output not shown: re-knit the document to regenerate it for this test)
# Hypothesis 3: hotels with a larger capacity have a higher average RoomRent
# than hotels with a smaller capacity.
# The hotels are split into a "low" and a "high" group at the median
# HotelCapacity, and the mean RoomRent of the two groups is compared. Passing
# RoomRent and HotelCapacity as the two samples would instead compare the mean
# rent with the mean capacity, which says nothing about this hypothesis.
capacity_group <- factor(
  city$HotelCapacity > median(city$HotelCapacity),
  levels = c(FALSE, TRUE),
  labels = c("low", "high")
)
# Groups are ordered low, high, so "less" tests mean(low) - mean(high) < 0.
t.test(city$RoomRent ~ capacity_group, alternative = "less")
## (output not shown: re-knit the document to regenerate it for this test)
# Multiple linear regression models for RoomRent
# Model 1: StarRating, HasSwimmingPool and HotelCapacity as predictors.
# The "- 1" term in the formula fits the model without an intercept.
fit1 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity - 1,
  data = city
)
summary(fit1)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity -
## 1, data = city)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8039 -2448 -1249 461 312401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## StarRating 1396.8746 26.1320 53.455 < 2e-16 ***
## HasSwimmingPool 3719.6943 148.7835 25.001 < 2e-16 ***
## HotelCapacity -7.6598 0.9415 -8.136 4.44e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6813 on 13229 degrees of freedom
## Multiple R-squared: 0.4457, Adjusted R-squared: 0.4456
## F-statistic: 3546 on 3 and 13229 DF, p-value: < 2.2e-16
# Estimated coefficients of model 1
fit1[["coefficients"]]
## StarRating HasSwimmingPool HotelCapacity
## 1396.874562 3719.694300 -7.659814
#Fitted residuals and values are checked and the deviation was around 1000 , because of
#large data points it's not suitable to show those in the output file.
### Model 1: RoomRent = b1*StarRating + b2*HasSwimmingPool + b3*HotelCapacity
# The "- 1" in the formula removes the intercept, so there is no b0 term (it does not set b0 to -1).
# b1 = 1396.874562, b2 = 3719.6943, b3 = -7.659814
# Model: RoomRent = 1396.874562*StarRating + 3719.6943*HasSwimmingPool - 7.659814*HotelCapacity
# Model 2: the predictors of model 1 plus the IsWeekend and
# IsTouristDestination indicators, again fitted without an intercept ("- 1").
fit2 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsWeekend + IsTouristDestination - 1,
  data = city
)
summary(fit2)
##
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
## IsWeekend + IsTouristDestination - 1, data = city)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8326 -2517 -1212 463 312480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## StarRating 1258.9558 44.4985 28.292 < 2e-16 ***
## HasSwimmingPool 3670.2511 148.8411 24.659 < 2e-16 ***
## HotelCapacity -6.1769 0.9658 -6.396 1.65e-10 ***
## IsWeekend -509.6479 119.1618 -4.277 1.91e-05 ***
## IsTouristDestination 1053.0394 124.7325 8.442 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6792 on 13227 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4491
## F-statistic: 2159 on 5 and 13227 DF, p-value: < 2.2e-16
# Estimated coefficients of model 2
fit2[["coefficients"]]
## StarRating HasSwimmingPool HotelCapacity
## 1258.955786 3670.251057 -6.176913
## IsWeekend IsTouristDestination
## -509.647863 1053.039364
#Fitted residuals and values are checked and the deviation was around 1000 , because of
#large data points it's not suitable to show those in the output file.
### Model 2: RoomRent = b1*StarRating + b2*HasSwimmingPool + b3*HotelCapacity + b4*IsWeekend + b5*IsTouristDestination
# The "- 1" in the formula removes the intercept, so there is no b0 term (it does not set b0 to -1).
# b1 = 1258.955786, b2 = 3670.251057, b3 = -6.176913, b4 = -509.647863, b5 = 1053.039364
# Model: RoomRent = 1258.955786*StarRating + 3670.251057*HasSwimmingPool - 6.176913*HotelCapacity
# - 509.647863*IsWeekend + 1053.039364*IsTouristDestination
# Set the knitr default chunk option so that code is echoed alongside its output.
knitr::opts_chunk$set(echo = TRUE)