cities <- read.csv(paste("Cities42.csv", sep=""))
View(cities)
dim(cities)
## [1] 13232 19
library(psych)
describe(cities)[,c(1:5)]
## vars n mean sd median
## CityName* 1 13232 18.07 11.72 16
## Population 2 13232 4416836.87 4258386.00 3046163
## CityRank 3 13232 14.83 13.51 9
## IsMetroCity 4 13232 0.28 0.45 0
## IsTouristDestination 5 13232 0.70 0.46 1
## IsWeekend 6 13232 0.62 0.48 1
## IsNewYearEve 7 13232 0.12 0.33 0
## Date* 8 13232 14.30 2.69 14
## HotelName* 9 13232 841.19 488.16 827
## RoomRent 10 13232 5473.99 7333.12 4000
## StarRating 11 13232 3.46 0.76 3
## Airport 12 13232 21.16 22.76 15
## HotelAddress* 13 13232 1202.53 582.17 1261
## HotelPincode 14 13232 397430.26 259837.50 395003
## HotelDescription* 15 13224 581.34 363.26 567
## FreeWifi 16 13232 0.93 0.26 1
## FreeBreakfast 17 13232 0.65 0.48 1
## HotelCapacity 18 13232 62.51 76.66 34
## HasSwimmingPool 19 13232 0.36 0.48 0
One way contingency tables
mytable <- with(cities,table(CityName))
mytable
## CityName
## Agra Ahmedabad Amritsar Bangalore
## 432 424 136 656
## Bhubaneswar Chandigarh Chennai Darjeeling
## 120 336 416 136
## Delhi Gangtok Goa Guwahati
## 2048 128 624 48
## Haridwar Hyderabad Indore Jaipur
## 48 536 160 768
## Jaisalmer Jodhpur Kanpur Kochi
## 264 224 16 608
## Kolkata Lucknow Madurai Manali
## 512 128 112 288
## Mangalore Mumbai Munnar Mysore
## 104 712 328 160
## Nainital Ooty Panchkula Pune
## 144 136 64 600
## Puri Rajkot Rishikesh Shimla
## 56 128 88 280
## Srinagar Surat Thiruvanthipuram Thrissur
## 40 80 392 32
## Udaipur Varanasi
## 456 264
mytable1 <- with(cities,table(FreeWifi))
mytable1
## FreeWifi
## 0 1
## 981 12251
mytable2 <- with(cities,table(FreeBreakfast))
mytable2
## FreeBreakfast
## 0 1
## 4643 8589
mytable3 <-with(cities,table(HasSwimmingPool))
mytable3
## HasSwimmingPool
## 0 1
## 8524 4708
Two Way Contingency tables
mytable4<-xtabs(~StarRating+FreeWifi,data=cities)
mytable4
## FreeWifi
## StarRating 0 1
## 0 0 16
## 1 0 8
## 2 80 360
## 2.5 104 528
## 3 336 5617
## 3.2 0 8
## 3.3 0 16
## 3.4 0 8
## 3.5 96 1656
## 3.6 0 8
## 3.7 0 24
## 3.8 0 16
## 3.9 0 32
## 4 231 2232
## 4.1 0 24
## 4.3 0 16
## 4.4 0 8
## 4.5 24 352
## 4.7 0 8
## 4.8 0 16
## 5 110 1298
mytable5<-xtabs(~StarRating+FreeBreakfast,data=cities)
mytable5
## FreeBreakfast
## StarRating 0 1
## 0 16 0
## 1 0 8
## 2 216 224
## 2.5 296 336
## 3 1789 4164
## 3.2 0 8
## 3.3 8 8
## 3.4 0 8
## 3.5 661 1091
## 3.6 8 0
## 3.7 0 24
## 3.8 8 8
## 3.9 16 16
## 4 783 1680
## 4.1 0 24
## 4.3 16 0
## 4.4 0 8
## 4.5 224 152
## 4.7 8 0
## 4.8 0 16
## 5 594 814
par(mfrow=c(1,3))
boxplot(cities$StarRating,beside=TRUE,ylab="Star Rating",col="pink")
boxplot(cities$Airport,beside=TRUE,ylab="Distance to airport",col="pink")
boxplot(cities$HotelCapacity,beside=TRUE,ylab="Hotel Capacity",col="pink")
hist(cities$Population,col="blue",main="Population",xlab="population")
cities$IsMetroCity=factor(cities$IsMetroCity, levels=c(0,1), labels=c("No","Yes"))
plot(cities$IsMetroCity,col="green",main="Metro City")
hist(cities$StarRating,col="blue",main="Star Rating of Hotels",xlab="Star Rating")
cities$FreeWifi=factor(cities$FreeWifi, levels=c(0,1), labels=c("No","Yes"))
plot(cities$FreeWifi,col="green",main="Has Wifi?")
cities$FreeBreakfast=factor(cities$FreeBreakfast, levels=c(0,1), labels=c("No","Yes"))
plot(cities$FreeBreakfast,col="green",main="Has Free Breakfast?")
cities$HasSwimmingPool=factor(cities$HasSwimmingPool, levels=c(0,1), labels=c("No","Yes"))
plot(cities$HasSwimmingPool,col="green",main="Swimming pool?")
hist(cities$HotelCapacity,col="blue",main="Hotel Capacities",breaks=20,xlab="capacity")
x<-cities[,c("Population", "RoomRent","StarRating","Airport","HotelCapacity")]
y<-cities[,c("Population", "RoomRent","StarRating","Airport","HotelCapacity")]
cor(x,y,method="pearson")
## Population RoomRent StarRating Airport
## Population 1.00000000 -0.08872806 0.13413659 -0.25970102
## RoomRent -0.08872806 1.00000000 0.36937343 0.04965324
## StarRating 0.13413659 0.36937343 1.00000000 -0.06091918
## Airport -0.25970102 0.04965324 -0.06091918 1.00000000
## HotelCapacity 0.25998305 0.15787331 0.63743034 -0.11767207
## HotelCapacity
## Population 0.2599831
## RoomRent 0.1578733
## StarRating 0.6374303
## Airport -0.1176721
## HotelCapacity 1.0000000
library(corrgram)
corrgram(cities, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Hotel Pricing")
library(car)
scatterplot(RoomRent~StarRating, data=cities,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of Star Rating vs Room rent",
ylab="Room Rent",
xlab="Star Rating")
library(car)
scatterplot(HotelCapacity~StarRating, data=cities,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of Star Rating vs Hotel Capacity",
ylab="Room Rent",
xlab="Hotel Capacity")
cc <-cbind(cities[,c(10,11,12,18)])
cov(cc)
## RoomRent StarRating Airport HotelCapacity
## RoomRent 53774601.806 2048.3754792 8287.178584 88753.41284
## StarRating 2048.375 0.5718875 -1.048528 36.95522
## Airport 8287.179 -1.0485276 518.013328 -205.32017
## HotelCapacity 88753.413 36.9552206 -205.320172 5877.26810
cor.test(cities$RoomRent,cities$StarRating)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$StarRating
## t = 45.719, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3545660 0.3839956
## sample estimates:
## cor
## 0.3693734
Since p value is 2.2e-16 we can reject the null hypothesis that the correlation between pricing and star rating is not 0.
cor.test(cities$RoomRent,cities$Airport)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$Airport
## t = 5.7183, df = 13230, p-value = 1.099e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03264192 0.06663581
## sample estimates:
## cor
## 0.04965324
Since p value is 1.099e-08 we can reject the null hypothesis that the correlation between pricing and distance to airport is not 0.
cor.test(cities$RoomRent,cities$HotelCapacity)
##
## Pearson's product-moment correlation
##
## data: cities$RoomRent and cities$HotelCapacity
## t = 18.389, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1412142 0.1744430
## sample estimates:
## cor
## 0.1578733
Since p value is less than 2.2e-16 we can reject the null hypothesis that the correlation between pricing and hotel capacity is not 0.
Let the null hypothesis be that the RoomRent depends on the other factors(Star rating, Airport distance, Hotel Capcity)
t.test(cities$RoomRent,cities$StarRating)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5345.575 5595.491
## sample estimates:
## mean of x mean of y
## 5473.991838 3.458933
Since p value lower than 0.05 we can reject the null hypothesis
t.test(cities$RoomRent,cities$Airport)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$Airport
## t = 85.535, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5327.875 5577.792
## sample estimates:
## mean of x mean of y
## 5473.99184 21.15874
Since p value lower than 0.05 we can reject the null hypothesis
t.test(cities$RoomRent,cities$HotelCapacity)
##
## Welch Two Sample t-test
##
## data: cities$RoomRent and cities$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5286.515 5536.445
## sample estimates:
## mean of x mean of y
## 5473.99184 62.51164
Since p value lower than 0.05 we can reject the null hypothesis