Reading the dataset and describing the dimensions

    # Load the hotel pricing data set and report its dimensions (rows, columns).
    cities <- read.csv("Cities42.csv")
    # View() opens an interactive data viewer, so only call it in a live session.
    if (interactive()) {
      View(cities)
    }
    dim(cities)
## [1] 13232    19

Describing essential statistics

   library(psych)
   describe(cities)[, 1:5]  # keep only the first five summary columns
##                      vars     n       mean         sd  median
## CityName*               1 13232      18.07      11.72      16
## Population              2 13232 4416836.87 4258386.00 3046163
## CityRank                3 13232      14.83      13.51       9
## IsMetroCity             4 13232       0.28       0.45       0
## IsTouristDestination    5 13232       0.70       0.46       1
## IsWeekend               6 13232       0.62       0.48       1
## IsNewYearEve            7 13232       0.12       0.33       0
## Date*                   8 13232      14.30       2.69      14
## HotelName*              9 13232     841.19     488.16     827
## RoomRent               10 13232    5473.99    7333.12    4000
## StarRating             11 13232       3.46       0.76       3
## Airport                12 13232      21.16      22.76      15
## HotelAddress*          13 13232    1202.53     582.17    1261
## HotelPincode           14 13232  397430.26  259837.50  395003
## HotelDescription*      15 13224     581.34     363.26     567
## FreeWifi               16 13232       0.93       0.26       1
## FreeBreakfast          17 13232       0.65       0.48       1
## HotelCapacity          18 13232      62.51      76.66      34
## HasSwimmingPool        19 13232       0.36       0.48       0

Variable Analysis

One way contingency tables

    # Number of observations recorded for each city.
    mytable <- table(CityName = cities$CityName)
    mytable
## CityName
##             Agra        Ahmedabad         Amritsar        Bangalore 
##              432              424              136              656 
##      Bhubaneswar       Chandigarh          Chennai       Darjeeling 
##              120              336              416              136 
##            Delhi          Gangtok              Goa         Guwahati 
##             2048              128              624               48 
##         Haridwar        Hyderabad           Indore           Jaipur 
##               48              536              160              768 
##        Jaisalmer          Jodhpur           Kanpur            Kochi 
##              264              224               16              608 
##          Kolkata          Lucknow          Madurai           Manali 
##              512              128              112              288 
##        Mangalore           Mumbai           Munnar           Mysore 
##              104              712              328              160 
##         Nainital             Ooty        Panchkula             Pune 
##              144              136               64              600 
##             Puri           Rajkot        Rishikesh           Shimla 
##               56              128               88              280 
##         Srinagar            Surat Thiruvanthipuram         Thrissur 
##               40               80              392               32 
##          Udaipur         Varanasi 
##              456              264
     # Number of observations without (0) and with (1) free Wi-Fi.
     mytable1 <- table(FreeWifi = cities$FreeWifi)
     mytable1
## FreeWifi
##     0     1 
##   981 12251
     # Number of observations without (0) and with (1) free breakfast.
     mytable2 <- table(FreeBreakfast = cities$FreeBreakfast)
     mytable2
## FreeBreakfast
##    0    1 
## 4643 8589
     # Number of observations without (0) and with (1) a swimming pool.
     mytable3 <- table(HasSwimmingPool = cities$HasSwimmingPool)
     mytable3
## HasSwimmingPool
##    0    1 
## 8524 4708

Two Way Contingency tables

    # Cross-tabulate star rating (rows) against free Wi-Fi (columns).
    mytable4 <- xtabs(~ StarRating + FreeWifi, data = cities)
    mytable4
##           FreeWifi
## StarRating    0    1
##        0      0   16
##        1      0    8
##        2     80  360
##        2.5  104  528
##        3    336 5617
##        3.2    0    8
##        3.3    0   16
##        3.4    0    8
##        3.5   96 1656
##        3.6    0    8
##        3.7    0   24
##        3.8    0   16
##        3.9    0   32
##        4    231 2232
##        4.1    0   24
##        4.3    0   16
##        4.4    0    8
##        4.5   24  352
##        4.7    0    8
##        4.8    0   16
##        5    110 1298
    # Cross-tabulate star rating (rows) against free breakfast (columns).
    mytable5 <- xtabs(~ StarRating + FreeBreakfast, data = cities)
    mytable5
##           FreeBreakfast
## StarRating    0    1
##        0     16    0
##        1      0    8
##        2    216  224
##        2.5  296  336
##        3   1789 4164
##        3.2    0    8
##        3.3    8    8
##        3.4    0    8
##        3.5  661 1091
##        3.6    8    0
##        3.7    0   24
##        3.8    8    8
##        3.9   16   16
##        4    783 1680
##        4.1    0   24
##        4.3   16    0
##        4.4    0    8
##        4.5  224  152
##        4.7    8    0
##        4.8    0   16
##        5    594  814

Boxplots

old_par <- par(mfrow = c(1, 3))  # one row of three panels; remember old layout
boxplot(cities$StarRating, ylab = "Star Rating", col = "pink")
boxplot(cities$Airport, ylab = "Distance to airport", col = "pink")
boxplot(cities$HotelCapacity, ylab = "Hotel Capacity", col = "pink")
par(old_par)  # restore the layout so later plots are not squeezed into 1x3

Histograms and Plots of Variables

    # Recode a 0/1 indicator as a factor labelled "No"/"Yes".
    as_yes_no <- function(flag) {
      factor(flag, levels = c(0, 1), labels = c("No", "Yes"))
    }

    hist(
      cities$Population,
      col = "blue", main = "Population", xlab = "population"
    )

    # The indicator columns are overwritten in place with their factor versions,
    # so every later use of `cities` sees "No"/"Yes" factors.
    cities$IsMetroCity <- as_yes_no(cities$IsMetroCity)
    plot(cities$IsMetroCity, col = "green", main = "Metro City")

    hist(
      cities$StarRating,
      col = "blue", main = "Star Rating of Hotels", xlab = "Star Rating"
    )

    cities$FreeWifi <- as_yes_no(cities$FreeWifi)
    plot(cities$FreeWifi, col = "green", main = "Has Wifi?")

    cities$FreeBreakfast <- as_yes_no(cities$FreeBreakfast)
    plot(cities$FreeBreakfast, col = "green", main = "Has Free Breakfast?")

    cities$HasSwimmingPool <- as_yes_no(cities$HasSwimmingPool)
    plot(cities$HasSwimmingPool, col = "green", main = "Swimming pool?")

    hist(
      cities$HotelCapacity,
      col = "blue", main = "Hotel Capacities", breaks = 20, xlab = "capacity"
    )

Correlation Matrix

    # Pearson correlations among the selected numeric columns.
    numeric_cols <- c(
      "Population", "RoomRent", "StarRating", "Airport", "HotelCapacity"
    )
    x <- cities[, numeric_cols]
    y <- x  # same columns on both sides, as in a full correlation matrix
    cor(x, y, method = "pearson")
##                Population    RoomRent  StarRating     Airport
## Population     1.00000000 -0.08872806  0.13413659 -0.25970102
## RoomRent      -0.08872806  1.00000000  0.36937343  0.04965324
## StarRating     0.13413659  0.36937343  1.00000000 -0.06091918
## Airport       -0.25970102  0.04965324 -0.06091918  1.00000000
## HotelCapacity  0.25998305  0.15787331  0.63743034 -0.11767207
##               HotelCapacity
## Population        0.2599831
## RoomRent          0.1578733
## StarRating        0.6374303
## Airport          -0.1176721
## HotelCapacity     1.0000000

Corrgram

    library(corrgram)
corrgram(
  cities,
  order = TRUE,
  lower.panel = panel.shade,  # shaded cells below the diagonal
  upper.panel = panel.pie,    # pie glyphs above the diagonal
  text.panel = panel.txt,     # variable names on the diagonal
  main = "Hotel Pricing"
)

Scatter Plots

      library(car)    
scatterplot(
  RoomRent ~ StarRating,  # y = room rent, x = star rating
  data = cities,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of Star Rating vs Room rent",
  xlab = "Star Rating",
  ylab = "Room Rent"
)

      library(car)    
scatterplot(
  HotelCapacity ~ StarRating,  # y = hotel capacity, x = star rating
  data = cities,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of Star Rating vs Hotel Capacity",
  xlab = "Star Rating",     # label now matches the formula's x variable
  ylab = "Hotel Capacity"   # label now matches the formula's y variable
)

Variance - Covariance Matrix

cov_cols <- c("RoomRent", "StarRating", "Airport", "HotelCapacity")
cc <- cities[, cov_cols]  # select by name so a column reorder cannot break it
cov(cc)
##                   RoomRent   StarRating     Airport HotelCapacity
## RoomRent      53774601.806 2048.3754792 8287.178584   88753.41284
## StarRating        2048.375    0.5718875   -1.048528      36.95522
## Airport           8287.179   -1.0485276  518.013328    -205.32017
## HotelCapacity    88753.413   36.9552206 -205.320172    5877.26810

Relation between Pricing and various independent variables

    # Pearson correlation test between room rent and star rating.
    cor.test(x = cities$RoomRent, y = cities$StarRating)
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$StarRating
## t = 45.719, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3545660 0.3839956
## sample estimates:
##       cor 
## 0.3693734

Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the correlation between pricing and star rating is 0.

cor.test(x = cities$RoomRent, y = cities$Airport)  # rent vs airport distance
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$Airport
## t = 5.7183, df = 13230, p-value = 1.099e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03264192 0.06663581
## sample estimates:
##        cor 
## 0.04965324

Since the p-value is 1.099e-08, we can reject the null hypothesis that the correlation between pricing and distance to airport is 0.

cor.test(x = cities$RoomRent, y = cities$HotelCapacity)  # rent vs capacity
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$HotelCapacity
## t = 18.389, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1412142 0.1744430
## sample estimates:
##       cor 
## 0.1578733

Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the correlation between pricing and hotel capacity is 0.

T-test

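For each Welch two-sample t-test below, let the null hypothesis be that the mean of RoomRent equals the mean of the other variable (star rating, airport distance, hotel capacity). Note that these tests only compare the means of the two variables; they do not test whether RoomRent depends on those factors.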

   t.test(x = cities$RoomRent, y = cities$StarRating)  # compares the two means
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5345.575 5595.491
## sample estimates:
##   mean of x   mean of y 
## 5473.991838    3.458933

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.

t.test(x = cities$RoomRent, y = cities$Airport)  # compares the two means
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$Airport
## t = 85.535, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5327.875 5577.792
## sample estimates:
##  mean of x  mean of y 
## 5473.99184   21.15874

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.

  t.test(x = cities$RoomRent, y = cities$HotelCapacity)  # compares the two means
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5286.515 5536.445
## sample estimates:
##  mean of x  mean of y 
## 5473.99184   62.51164

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.