# Load the hotel pricing data set from the working directory.
cities <- read.csv("Cities42.csv")
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities)
}
# Number of rows and columns in the data set.
dim(cities)
## [1] 13232 19
library(psych)
# Per-column summary statistics, restricted to the first five describe() columns.
describe(cities)[, c("vars", "n", "mean", "sd", "median")]
## vars n mean sd median
## CityName* 1 13232 18.07 11.72 16
## Population 2 13232 4416836.87 4258386.00 3046163
## CityRank 3 13232 14.83 13.51 9
## IsMetroCity 4 13232 0.28 0.45 0
## IsTouristDestination 5 13232 0.70 0.46 1
## IsWeekend 6 13232 0.62 0.48 1
## IsNewYearEve 7 13232 0.12 0.33 0
## Date* 8 13232 14.30 2.69 14
## HotelName* 9 13232 841.19 488.16 827
## RoomRent 10 13232 5473.99 7333.12 4000
## StarRating 11 13232 3.46 0.76 3
## Airport 12 13232 21.16 22.76 15
## HotelAddress* 13 13232 1202.53 582.17 1261
## HotelPincode 14 13232 397430.26 259837.50 395003
## HotelDescription* 15 13224 581.34 363.26 567
## FreeWifi 16 13232 0.93 0.26 1
## FreeBreakfast 17 13232 0.65 0.48 1
## HotelCapacity 18 13232 62.51 76.66 34
## HasSwimmingPool 19 13232 0.36 0.48 0
One-way contingency tables
# Number of rows (hotel listings) recorded for each city.
mytable <- table(cities$CityName, dnn = "CityName")
mytable
## CityName
## Agra Ahmedabad Amritsar Bangalore
## 432 424 136 656
## Bhubaneswar Chandigarh Chennai Darjeeling
## 120 336 416 136
## Delhi Gangtok Goa Guwahati
## 2048 128 624 48
## Haridwar Hyderabad Indore Jaipur
## 48 536 160 768
## Jaisalmer Jodhpur Kanpur Kochi
## 264 224 16 608
## Kolkata Lucknow Madurai Manali
## 512 128 112 288
## Mangalore Mumbai Munnar Mysore
## 104 712 328 160
## Nainital Ooty Panchkula Pune
## 144 136 64 600
## Puri Rajkot Rishikesh Shimla
## 56 128 88 280
## Srinagar Surat Thiruvanthipuram Thrissur
## 40 80 392 32
## Udaipur Varanasi
## 456 264
# Count of rows with (1) and without (0) free Wi-Fi.
mytable1 <- table(cities$FreeWifi, dnn = "FreeWifi")
mytable1
## FreeWifi
## 0 1
## 981 12251
# Count of rows with (1) and without (0) free breakfast.
mytable2 <- table(cities$FreeBreakfast, dnn = "FreeBreakfast")
mytable2
## FreeBreakfast
## 0 1
## 4643 8589
# Count of rows with (1) and without (0) a swimming pool.
mytable3 <- table(cities$HasSwimmingPool, dnn = "HasSwimmingPool")
mytable3
## HasSwimmingPool
## 0 1
## 8524 4708
Two-way contingency tables
# Cross-tabulate star rating (rows) against free Wi-Fi (columns).
mytable4 <- xtabs(
  formula = ~ StarRating + FreeWifi,
  data = cities
)
mytable4
## FreeWifi
## StarRating 0 1
## 0 0 16
## 1 0 8
## 2 80 360
## 2.5 104 528
## 3 336 5617
## 3.2 0 8
## 3.3 0 16
## 3.4 0 8
## 3.5 96 1656
## 3.6 0 8
## 3.7 0 24
## 3.8 0 16
## 3.9 0 32
## 4 231 2232
## 4.1 0 24
## 4.3 0 16
## 4.4 0 8
## 4.5 24 352
## 4.7 0 8
## 4.8 0 16
## 5 110 1298
# Cross-tabulate star rating (rows) against free breakfast (columns).
mytable5 <- xtabs(
  formula = ~ StarRating + FreeBreakfast,
  data = cities
)
mytable5
## FreeBreakfast
## StarRating 0 1
## 0 16 0
## 1 0 8
## 2 216 224
## 2.5 296 336
## 3 1789 4164
## 3.2 0 8
## 3.3 8 8
## 3.4 0 8
## 3.5 661 1091
## 3.6 8 0
## 3.7 0 24
## 3.8 8 8
## 3.9 16 16
## 4 783 1680
## 4.1 0 24
## 4.3 16 0
## 4.4 0 8
## 4.5 224 152
## 4.7 8 0
## 4.8 0 16
## 5 594 814
# Draw the three box plots side by side, then restore the previous layout so
# that later plots are not squeezed into the same 1 x 3 grid.
old_par <- par(mfrow = c(1, 3))
# `beside` is a barplot() argument, not a boxplot() one, so it is dropped here.
boxplot(cities$StarRating, ylab = "Star Rating", col = "pink")
boxplot(cities$Airport, ylab = "Distance to airport", col = "pink")
boxplot(cities$HotelCapacity, ylab = "Hotel Capacity", col = "pink")
par(old_par)
# Convert a 0/1 indicator into a No/Yes factor. A value that is already a
# factor is returned unchanged, so re-running this section does not replace
# the column with NA.
as_yes_no <- function(flag) {
  if (is.factor(flag)) {
    return(flag)
  }
  factor(flag, levels = c(0, 1), labels = c("No", "Yes"))
}

# Distribution of city population.
hist(cities$Population, col = "blue", main = "Population", xlab = "population")

# Metro vs non-metro rows.
cities$IsMetroCity <- as_yes_no(cities$IsMetroCity)
plot(cities$IsMetroCity, col = "green", main = "Metro City")

# Distribution of hotel star ratings.
hist(
  cities$StarRating,
  col = "blue",
  main = "Star Rating of Hotels",
  xlab = "Star Rating"
)

# Amenity indicators, each shown as a No/Yes bar chart.
cities$FreeWifi <- as_yes_no(cities$FreeWifi)
plot(cities$FreeWifi, col = "green", main = "Has Wifi?")

cities$FreeBreakfast <- as_yes_no(cities$FreeBreakfast)
plot(cities$FreeBreakfast, col = "green", main = "Has Free Breakfast?")

cities$HasSwimmingPool <- as_yes_no(cities$HasSwimmingPool)
plot(cities$HasSwimmingPool, col = "green", main = "Swimming pool?")

# Distribution of hotel capacity, using 20 suggested bins.
hist(
  cities$HotelCapacity,
  col = "blue",
  main = "Hotel Capacities",
  breaks = 20,
  xlab = "capacity"
)
# Pearson correlation matrix of the numeric variables. Both arguments hold the
# same columns, so the result is the correlation of that set with itself.
x <- cities[
  ,
  c("Population", "RoomRent", "StarRating", "Airport", "HotelCapacity")
]
y <- x
cor(x, y, method = "pearson")
## Population RoomRent StarRating Airport
## Population 1.00000000 -0.08872806 0.13413659 -0.25970102
## RoomRent -0.08872806 1.00000000 0.36937343 0.04965324
## StarRating 0.13413659 0.36937343 1.00000000 -0.06091918
## Airport -0.25970102 0.04965324 -0.06091918 1.00000000
## HotelCapacity 0.25998305 0.15787331 0.63743034 -0.11767207
## HotelCapacity
## Population 0.2599831
## RoomRent 0.1578733
## StarRating 0.6374303
## Airport -0.1176721
## HotelCapacity 1.0000000
library(corrgram)
# Correlogram of the data set: shaded cells in the lower panel, pie glyphs in
# the upper panel, and panel.txt for the text panel.
corrgram(
  cities,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Hotel Pricing"
)
library(car)
# Room rent (y) plotted against star rating (x).
scatterplot(
  RoomRent ~ StarRating,
  data = cities,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of Star Rating vs Room rent",
  ylab = "Room Rent",
  xlab = "Star Rating"
)
library(car)
# Hotel capacity (y) plotted against star rating (x). The axis labels now
# match the variables in the formula.
scatterplot(
  HotelCapacity ~ StarRating,
  data = cities,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of Star Rating vs Hotel Capacity",
  ylab = "Hotel Capacity",
  xlab = "Star Rating"
)
# Covariance matrix of rent, rating, airport distance and capacity. Columns
# are selected by name so the result does not depend on column order in the file.
cc <- cities[, c("RoomRent", "StarRating", "Airport", "HotelCapacity")]
cov(cc)
## RoomRent StarRating Airport HotelCapacity
## RoomRent 53774601.806 2048.3754792 8287.178584 88753.41284
## StarRating 2048.375 0.5718875 -1.048528 36.95522
## Airport 8287.179 -1.0485276 518.013328 -205.32017
## HotelCapacity 88753.413 36.9552206 -205.320172 5877.26810
# Test whether the correlation between room rent and star rating is zero.
cor.test(x = cities$RoomRent, y = cities$StarRating)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$StarRating
## t = 45.719, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3545660 0.3839956
## sample estimates:
## cor
## 0.3693734
Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the true correlation between room rent and star rating is 0.
# Test whether the correlation between room rent and airport distance is zero.
cor.test(x = cities$RoomRent, y = cities$Airport)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$Airport
## t = 5.7183, df = 13230, p-value = 1.099e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03264192 0.06663581
## sample estimates:
## cor
## 0.04965324
Since the p-value is 1.099e-08, we can reject the null hypothesis that the true correlation between room rent and distance to the airport is 0.
# Test whether the correlation between room rent and hotel capacity is zero.
cor.test(x = cities$RoomRent, y = cities$HotelCapacity)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$HotelCapacity
## t = 18.389, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1412142 0.1744430
## sample estimates:
## cor
## 0.1578733
Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the true correlation between room rent and hotel capacity is 0.
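For each of the following Welch two-sample t-tests, the null hypothesis is that the mean of RoomRent equals the mean of the other variable (star rating, airport distance, hotel capacity). Note that these variables are measured in different units, so the tests only compare their means; whether RoomRent depends on these factors is examined by the regression models further below.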
# Welch two-sample t-test on the means of RoomRent and StarRating.
# NOTE(review): the two variables are on different scales, so a difference in
# means does not show that rent depends on rating; confirm the intended test.
t.test(x = cities$RoomRent, y = cities$StarRating)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5345.575 5595.491
## sample estimates:
## mean of x mean of y
## 5473.991838 3.458933
Since the p-value is lower than 0.05, we can reject the null hypothesis that the mean of RoomRent equals the mean of StarRating.
# Welch two-sample t-test on the means of RoomRent and Airport.
# NOTE(review): the two variables are on different scales, so a difference in
# means does not show that rent depends on airport distance; confirm the intent.
t.test(x = cities$RoomRent, y = cities$Airport)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$Airport
## t = 85.535, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5327.875 5577.792
## sample estimates:
## mean of x mean of y
## 5473.99184 21.15874
Since the p-value is lower than 0.05, we can reject the null hypothesis that the mean of RoomRent equals the mean of Airport.
# Welch two-sample t-test on the means of RoomRent and HotelCapacity.
# NOTE(review): the two variables are on different scales, so a difference in
# means does not show that rent depends on capacity; confirm the intended test.
t.test(x = cities$RoomRent, y = cities$HotelCapacity)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5286.515 5536.445
## sample estimates:
## mean of x mean of y
## 5473.99184 62.51164
Since the p-value is lower than 0.05, we can reject the null hypothesis that the mean of RoomRent equals the mean of HotelCapacity.
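Regression models: the dependent variable (Y) is RoomRent; the independent variables (X) are StarRating, Airport and HotelCapacity, with a separate model fitted for each selected city.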
# Mumbai: regress room rent on star rating, airport distance and hotel capacity.
cities1 <- cities[cities$CityName %in% "Mumbai", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities1)
}
max(cities1$RoomRent)
## [1] 24378
# Passing `data =` instead of cities1$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit1 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities1)
summary(Mfit1)
##
## Call:
## lm(formula = cities1$RoomRent ~ cities1$StarRating + cities1$Airport +
## cities1$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7427.4 -1385.0 -222.7 1203.3 15468.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2157.039 554.740 -3.888 0.00011 ***
## cities1$StarRating 1892.941 154.068 12.286 < 2e-16 ***
## cities1$Airport 72.650 13.290 5.466 6.37e-08 ***
## cities1$HotelCapacity 7.349 0.893 8.230 8.97e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2192 on 708 degrees of freedom
## Multiple R-squared: 0.5083, Adjusted R-squared: 0.5062
## F-statistic: 244 on 3 and 708 DF, p-value: < 2.2e-16
# Hyderabad: regress room rent on star rating, airport distance and capacity.
cities2 <- cities[cities$CityName %in% "Hyderabad", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities2)
}
max(cities2$RoomRent)
## [1] 40000
# Passing `data =` instead of cities2$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit2 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities2)
summary(Mfit2)
##
## Call:
## lm(formula = cities2$RoomRent ~ cities2$StarRating + cities2$Airport +
## cities2$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5210.5 -2192.8 -479.6 1301.5 22809.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -131.246 1863.540 -0.070 0.943879
## cities2$StarRating 4703.886 392.269 11.991 < 2e-16 ***
## cities2$Airport -541.129 55.738 -9.708 < 2e-16 ***
## cities2$HotelCapacity -13.105 3.562 -3.679 0.000258 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3549 on 532 degrees of freedom
## Multiple R-squared: 0.4518, Adjusted R-squared: 0.4487
## F-statistic: 146.2 on 3 and 532 DF, p-value: < 2.2e-16
# Udaipur: regress room rent on star rating, airport distance and capacity.
cities3 <- cities[cities$CityName %in% "Udaipur", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities3)
}
max(cities3$RoomRent)
## [1] 52000
# Passing `data =` instead of cities3$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit3 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities3)
summary(Mfit3)
##
## Call:
## lm(formula = cities3$RoomRent ~ cities3$StarRating + cities3$Airport +
## cities3$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21581 -4136 -1269 1969 28239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21863.64 2254.36 -9.698 < 2e-16 ***
## cities3$StarRating 10250.02 647.72 15.825 < 2e-16 ***
## cities3$Airport -176.12 69.37 -2.539 0.0115 *
## cities3$HotelCapacity 49.69 11.02 4.510 8.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8350 on 452 degrees of freedom
## Multiple R-squared: 0.4944, Adjusted R-squared: 0.491
## F-statistic: 147.3 on 3 and 452 DF, p-value: < 2.2e-16
# Delhi: regress room rent on star rating, airport distance and capacity.
cities4 <- cities[cities$CityName %in% "Delhi", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities4)
}
max(cities4$RoomRent)
## [1] 45000
# Passing `data =` instead of cities4$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit4 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities4)
summary(Mfit4)
##
## Call:
## lm(formula = cities4$RoomRent ~ cities4$StarRating + cities4$Airport +
## cities4$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4480 -1115 -396 698 35735
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6782.7047 336.3556 -20.165 < 2e-16 ***
## cities4$StarRating 2970.9032 98.2369 30.242 < 2e-16 ***
## cities4$Airport 31.7254 8.5299 3.719 0.000205 ***
## cities4$HotelCapacity 0.2545 0.7127 0.357 0.721111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2460 on 2044 degrees of freedom
## Multiple R-squared: 0.4843, Adjusted R-squared: 0.4835
## F-statistic: 639.8 on 3 and 2044 DF, p-value: < 2.2e-16
# Manali: regress room rent on star rating, airport distance and capacity.
cities5 <- cities[cities$CityName %in% "Manali", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities5)
}
max(cities5$RoomRent)
## [1] 12134
# Passing `data =` instead of cities5$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit5 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities5)
summary(Mfit5)
##
## Call:
## lm(formula = cities5$RoomRent ~ cities5$StarRating + cities5$Airport +
## cities5$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4533.6 -1492.6 -474.4 1484.9 7315.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2430.915 1137.246 2.138 0.0334 *
## cities5$StarRating 1655.426 245.705 6.737 8.95e-11 ***
## cities5$Airport -81.959 17.271 -4.745 3.30e-06 ***
## cities5$HotelCapacity 5.710 5.999 0.952 0.3420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2183 on 284 degrees of freedom
## Multiple R-squared: 0.2399, Adjusted R-squared: 0.2318
## F-statistic: 29.87 on 3 and 284 DF, p-value: < 2.2e-16
# Goa: regress room rent on star rating, airport distance and capacity.
cities6 <- cities[cities$CityName %in% "Goa", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities6)
}
max(cities6$RoomRent)
## [1] 41999
# Passing `data =` instead of cities6$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit6 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities6)
summary(Mfit6)
##
## Call:
## lm(formula = cities6$RoomRent ~ cities6$StarRating + cities6$Airport +
## cities6$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8732 -2963 -911 1564 26945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4531.695 966.849 -4.687 3.41e-06 ***
## cities6$StarRating 3572.736 272.225 13.124 < 2e-16 ***
## cities6$Airport 26.351 17.261 1.527 0.127
## cities6$HotelCapacity 7.759 5.155 1.505 0.133
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4993 on 620 degrees of freedom
## Multiple R-squared: 0.3378, Adjusted R-squared: 0.3346
## F-statistic: 105.4 on 3 and 620 DF, p-value: < 2.2e-16
# Bangalore: regress room rent on star rating, airport distance and capacity.
cities7 <- cities[cities$CityName %in% "Bangalore", ]
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(cities7)
}
max(cities7$RoomRent)
## [1] 13500
# Passing `data =` instead of cities7$... terms lets predict() accept new data;
# the summary then labels terms as StarRating, Airport and HotelCapacity.
Mfit7 <- lm(RoomRent ~ StarRating + Airport + HotelCapacity, data = cities7)
summary(Mfit7)
##
## Call:
## lm(formula = cities7$RoomRent ~ cities7$StarRating + cities7$Airport +
## cities7$HotelCapacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3015.6 -1015.6 -263.5 716.7 7137.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -811.303 573.174 -1.415 0.157412
## cities7$StarRating 1453.173 135.721 10.707 < 2e-16 ***
## cities7$Airport -40.013 10.679 -3.747 0.000195 ***
## cities7$HotelCapacity 8.444 1.193 7.076 3.84e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1608 on 652 degrees of freedom
## Multiple R-squared: 0.542, Adjusted R-squared: 0.5399
## F-statistic: 257.2 on 3 and 652 DF, p-value: < 2.2e-16