Reading the dataset and describing the dimensions

    # Load the hotel pricing dataset, open it in the data viewer and report
    # its dimensions (rows, columns). The file name is a plain literal, so it
    # is passed to read.csv() directly instead of through a no-op paste().
    cities <- read.csv("Cities42.csv")
    View(cities)
    dim(cities)
## [1] 13232    19

Describing essential statistics

   # Summary statistics from psych::describe(); only the first five columns
   # of the result (vars, n, mean, sd, median) are shown.
   library(psych)
   describe(cities)[, 1:5]
##                      vars     n       mean         sd  median
## CityName*               1 13232      18.07      11.72      16
## Population              2 13232 4416836.87 4258386.00 3046163
## CityRank                3 13232      14.83      13.51       9
## IsMetroCity             4 13232       0.28       0.45       0
## IsTouristDestination    5 13232       0.70       0.46       1
## IsWeekend               6 13232       0.62       0.48       1
## IsNewYearEve            7 13232       0.12       0.33       0
## Date*                   8 13232      14.30       2.69      14
## HotelName*              9 13232     841.19     488.16     827
## RoomRent               10 13232    5473.99    7333.12    4000
## StarRating             11 13232       3.46       0.76       3
## Airport                12 13232      21.16      22.76      15
## HotelAddress*          13 13232    1202.53     582.17    1261
## HotelPincode           14 13232  397430.26  259837.50  395003
## HotelDescription*      15 13224     581.34     363.26     567
## FreeWifi               16 13232       0.93       0.26       1
## FreeBreakfast          17 13232       0.65       0.48       1
## HotelCapacity          18 13232      62.51      76.66      34
## HasSwimmingPool        19 13232       0.36       0.48       0

Variable Analysis

One way contingency tables

    # Number of rows recorded for each city.
    mytable <- table(CityName = cities$CityName)
    mytable
## CityName
##             Agra        Ahmedabad         Amritsar        Bangalore 
##              432              424              136              656 
##      Bhubaneswar       Chandigarh          Chennai       Darjeeling 
##              120              336              416              136 
##            Delhi          Gangtok              Goa         Guwahati 
##             2048              128              624               48 
##         Haridwar        Hyderabad           Indore           Jaipur 
##               48              536              160              768 
##        Jaisalmer          Jodhpur           Kanpur            Kochi 
##              264              224               16              608 
##          Kolkata          Lucknow          Madurai           Manali 
##              512              128              112              288 
##        Mangalore           Mumbai           Munnar           Mysore 
##              104              712              328              160 
##         Nainital             Ooty        Panchkula             Pune 
##              144              136               64              600 
##             Puri           Rajkot        Rishikesh           Shimla 
##               56              128               88              280 
##         Srinagar            Surat Thiruvanthipuram         Thrissur 
##               40               80              392               32 
##          Udaipur         Varanasi 
##              456              264
     # Number of rows without (0) and with (1) free wifi.
     mytable1 <- table(FreeWifi = cities$FreeWifi)
     mytable1
## FreeWifi
##     0     1 
##   981 12251
     # Number of rows without (0) and with (1) free breakfast.
     mytable2 <- table(FreeBreakfast = cities$FreeBreakfast)
     mytable2
## FreeBreakfast
##    0    1 
## 4643 8589
     # Number of rows without (0) and with (1) a swimming pool.
     mytable3 <- table(HasSwimmingPool = cities$HasSwimmingPool)
     mytable3
## HasSwimmingPool
##    0    1 
## 8524 4708

Two Way Contingency tables

    # Cross-tabulate star rating (rows) against free wifi (columns).
    mytable4 <- xtabs(formula = ~ StarRating + FreeWifi, data = cities)
    mytable4
##           FreeWifi
## StarRating    0    1
##        0      0   16
##        1      0    8
##        2     80  360
##        2.5  104  528
##        3    336 5617
##        3.2    0    8
##        3.3    0   16
##        3.4    0    8
##        3.5   96 1656
##        3.6    0    8
##        3.7    0   24
##        3.8    0   16
##        3.9    0   32
##        4    231 2232
##        4.1    0   24
##        4.3    0   16
##        4.4    0    8
##        4.5   24  352
##        4.7    0    8
##        4.8    0   16
##        5    110 1298
    # Cross-tabulate star rating (rows) against free breakfast (columns).
    mytable5 <- xtabs(formula = ~ StarRating + FreeBreakfast, data = cities)
    mytable5
##           FreeBreakfast
## StarRating    0    1
##        0     16    0
##        1      0    8
##        2    216  224
##        2.5  296  336
##        3   1789 4164
##        3.2    0    8
##        3.3    8    8
##        3.4    0    8
##        3.5  661 1091
##        3.6    8    0
##        3.7    0   24
##        3.8    8    8
##        3.9   16   16
##        4    783 1680
##        4.1    0   24
##        4.3   16    0
##        4.4    0    8
##        4.5  224  152
##        4.7    8    0
##        4.8    0   16
##        5    594  814

Boxplots

# Draw the three boxplots next to each other in a 1 x 3 grid. par() returns
# the previous settings, which are restored afterwards so that later plots
# are not squeezed into the same grid.
# `beside` is a barplot() argument, not a boxplot() one; passing it here only
# produced "not a graphical parameter" warnings, so it is dropped.
old_par <- par(mfrow = c(1, 3))
boxplot(cities$StarRating, ylab = "Star Rating", col = "pink")
boxplot(cities$Airport, ylab = "Distance to airport", col = "pink")
boxplot(cities$HotelCapacity, ylab = "Hotel Capacity", col = "pink")
par(old_par)

Histograms and Plots of Variables

    # Distribution of city population.
    hist(cities$Population, col = "blue", main = "Population",
         xlab = "population")

    # Each 0/1 indicator below is recoded in place as a No/Yes factor and then
    # plotted. NOTE(review): re-running a recode on an already-converted
    # column would not match levels 0/1 -- run this section once per load.
    cities$IsMetroCity <- factor(cities$IsMetroCity,
                                 levels = c(0, 1), labels = c("No", "Yes"))
    plot(cities$IsMetroCity, col = "green", main = "Metro City")

    # Distribution of hotel star ratings.
    hist(cities$StarRating, col = "blue", main = "Star Rating of Hotels",
         xlab = "Star Rating")

    cities$FreeWifi <- factor(cities$FreeWifi,
                              levels = c(0, 1), labels = c("No", "Yes"))
    plot(cities$FreeWifi, col = "green", main = "Has Wifi?")

    cities$FreeBreakfast <- factor(cities$FreeBreakfast,
                                   levels = c(0, 1), labels = c("No", "Yes"))
    plot(cities$FreeBreakfast, col = "green", main = "Has Free Breakfast?")

    cities$HasSwimmingPool <- factor(cities$HasSwimmingPool,
                                     levels = c(0, 1), labels = c("No", "Yes"))
    plot(cities$HasSwimmingPool, col = "green", main = "Swimming pool?")

    # Distribution of hotel capacity, using 20 suggested breaks.
    hist(cities$HotelCapacity, col = "blue", main = "Hotel Capacities",
         breaks = 20, xlab = "capacity")

Correlation Matrix

    # Pearson correlation matrix of the five numeric variables. x and y hold
    # the same columns, so y is taken as a copy of x.
    x <- cities[, c("Population", "RoomRent", "StarRating", "Airport",
                    "HotelCapacity")]

    y <- x
    cor(x, y, method = "pearson")
##                Population    RoomRent  StarRating     Airport
## Population     1.00000000 -0.08872806  0.13413659 -0.25970102
## RoomRent      -0.08872806  1.00000000  0.36937343  0.04965324
## StarRating     0.13413659  0.36937343  1.00000000 -0.06091918
## Airport       -0.25970102  0.04965324 -0.06091918  1.00000000
## HotelCapacity  0.25998305  0.15787331  0.63743034 -0.11767207
##               HotelCapacity
## Population        0.2599831
## RoomRent          0.1578733
## StarRating        0.6374303
## Airport          -0.1176721
## HotelCapacity     1.0000000

Corrgram

    # Corrgram of the dataset: shaded cells in the lower triangle, pie
    # glyphs in the upper triangle, variable names drawn by panel.txt.
    library(corrgram)
    corrgram(
      cities,
      order = TRUE,
      lower.panel = panel.shade,
      upper.panel = panel.pie,
      text.panel = panel.txt,
      main = "Hotel Pricing"
    )

Scatter Plots

      # Room rent (y) against star rating (x), with a dashed smoother line.
      library(car)
      scatterplot(
        RoomRent ~ StarRating,
        data = cities,
        spread = FALSE,
        smoother.args = list(lty = 2),
        main = "Scatter plot of Star Rating vs Room rent",
        xlab = "Star Rating",
        ylab = "Room Rent"
      )

      # Hotel capacity (y) against star rating (x), with a dashed smoother
      # line. The axis labels now name the variables actually plotted: the
      # formula puts HotelCapacity on the y axis and StarRating on the x axis.
      library(car)
      scatterplot(HotelCapacity ~ StarRating, data = cities,
                  spread = FALSE, smoother.args = list(lty = 2),
                  main = "Scatter plot of Star Rating vs Hotel Capacity",
                  ylab = "Hotel Capacity",
                  xlab = "Star Rating")

Variance - Covariance Matrix

# Variance-covariance matrix of room rent, star rating, airport distance and
# hotel capacity. The columns are selected by name instead of by position
# (10, 11, 12, 18) so the selection survives any column reordering, and the
# cbind() wrapper around a single data frame (a no-op) is dropped.
cc <- cities[, c("RoomRent", "StarRating", "Airport", "HotelCapacity")]
cov(cc)
##                   RoomRent   StarRating     Airport HotelCapacity
## RoomRent      53774601.806 2048.3754792 8287.178584   88753.41284
## StarRating        2048.375    0.5718875   -1.048528      36.95522
## Airport           8287.179   -1.0485276  518.013328    -205.32017
## HotelCapacity    88753.413   36.9552206 -205.320172    5877.26810

Relation between Pricing and various independent variables

    # Pearson correlation test between room rent and star rating.
    cor.test(x = cities$RoomRent, y = cities$StarRating)
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$StarRating
## t = 45.719, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3545660 0.3839956
## sample estimates:
##       cor 
## 0.3693734

Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the correlation between pricing and star rating is 0.

# Pearson correlation test between room rent and distance to the airport.
cor.test(x = cities$RoomRent, y = cities$Airport)
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$Airport
## t = 5.7183, df = 13230, p-value = 1.099e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03264192 0.06663581
## sample estimates:
##        cor 
## 0.04965324

Since the p-value is 1.099e-08, we can reject the null hypothesis that the correlation between pricing and distance to airport is 0.

# Pearson correlation test between room rent and hotel capacity.
cor.test(x = cities$RoomRent, y = cities$HotelCapacity)
## 
##  Pearson's product-moment correlation
## 
## data:  cities$RoomRent and cities$HotelCapacity
## t = 18.389, df = 13230, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1412142 0.1744430
## sample estimates:
##       cor 
## 0.1578733

Since the p-value is less than 2.2e-16, we can reject the null hypothesis that the correlation between pricing and hotel capacity is 0.

T-test

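Let the null hypothesis be that the mean RoomRent is equal to the mean of each of the other factors (Star Rating, Airport distance, Hotel Capacity). Note that a two-sample t-test only compares means; on its own it does not show whether RoomRent depends on these factors.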

   # Welch two-sample t-test on the means of RoomRent and StarRating.
   # NOTE(review): the two variables are on different scales, so this tests
   # equality of means, not dependence -- confirm this is the intended test.
   t.test(x = cities$RoomRent, y = cities$StarRating)
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5345.575 5595.491
## sample estimates:
##   mean of x   mean of y 
## 5473.991838    3.458933

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.

# Welch two-sample t-test on the means of RoomRent and Airport.
# NOTE(review): the two variables are on different scales, so this tests
# equality of means, not dependence -- confirm this is the intended test.
t.test(x = cities$RoomRent, y = cities$Airport)
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$Airport
## t = 85.535, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5327.875 5577.792
## sample estimates:
##  mean of x  mean of y 
## 5473.99184   21.15874

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.

  # Welch two-sample t-test on the means of RoomRent and HotelCapacity.
  # NOTE(review): the two variables are on different scales, so this tests
  # equality of means, not dependence -- confirm this is the intended test.
  t.test(x = cities$RoomRent, y = cities$HotelCapacity)
## 
##  Welch Two Sample t-test
## 
## data:  cities$RoomRent and cities$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5286.515 5536.445
## sample estimates:
##  mean of x  mean of y 
## 5473.99184   62.51164

Since the p-value is lower than 0.05, we can reject the null hypothesis that the two means are equal.

Regression Models

Response (Y): Room rent. Predictors (X): Star Rating, Airport, Hotel Capacity, with one model fitted separately for each city.

Mumbai

    # Mumbai: keep this city's rows, inspect them, report the highest room
    # rent, then regress room rent on star rating, airport distance and
    # hotel capacity.
    cities1 <- subset(cities, CityName == "Mumbai")
    View(cities1)
    max(cities1$RoomRent)
## [1] 24378
    Mfit1 <- lm(
      cities1$RoomRent ~
        cities1$StarRating + cities1$Airport + cities1$HotelCapacity
    )
    summary(Mfit1)
## 
## Call:
## lm(formula = cities1$RoomRent ~ cities1$StarRating + cities1$Airport + 
##     cities1$HotelCapacity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7427.4 -1385.0  -222.7  1203.3 15468.1 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2157.039    554.740  -3.888  0.00011 ***
## cities1$StarRating     1892.941    154.068  12.286  < 2e-16 ***
## cities1$Airport          72.650     13.290   5.466 6.37e-08 ***
## cities1$HotelCapacity     7.349      0.893   8.230 8.97e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2192 on 708 degrees of freedom
## Multiple R-squared:  0.5083, Adjusted R-squared:  0.5062 
## F-statistic:   244 on 3 and 708 DF,  p-value: < 2.2e-16

Hyderabad

# Hyderabad: keep this city's rows, inspect them, report the highest room
# rent, then regress room rent on star rating, airport distance and hotel
# capacity.
cities2 <- subset(cities, CityName == "Hyderabad")
View(cities2)
max(cities2$RoomRent)
## [1] 40000
Mfit2 <- lm(
  cities2$RoomRent ~
    cities2$StarRating + cities2$Airport + cities2$HotelCapacity
)
summary(Mfit2)
## 
## Call:
## lm(formula = cities2$RoomRent ~ cities2$StarRating + cities2$Airport + 
##     cities2$HotelCapacity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5210.5 -2192.8  -479.6  1301.5 22809.4 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -131.246   1863.540  -0.070 0.943879    
## cities2$StarRating    4703.886    392.269  11.991  < 2e-16 ***
## cities2$Airport       -541.129     55.738  -9.708  < 2e-16 ***
## cities2$HotelCapacity  -13.105      3.562  -3.679 0.000258 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3549 on 532 degrees of freedom
## Multiple R-squared:  0.4518, Adjusted R-squared:  0.4487 
## F-statistic: 146.2 on 3 and 532 DF,  p-value: < 2.2e-16

Udaipur

# Udaipur: keep this city's rows, inspect them, report the highest room
# rent, then regress room rent on star rating, airport distance and hotel
# capacity.
cities3 <- subset(cities, CityName == "Udaipur")
View(cities3)
max(cities3$RoomRent)
## [1] 52000
Mfit3 <- lm(
  cities3$RoomRent ~
    cities3$StarRating + cities3$Airport + cities3$HotelCapacity
)
summary(Mfit3)
## 
## Call:
## lm(formula = cities3$RoomRent ~ cities3$StarRating + cities3$Airport + 
##     cities3$HotelCapacity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21581  -4136  -1269   1969  28239 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -21863.64    2254.36  -9.698  < 2e-16 ***
## cities3$StarRating     10250.02     647.72  15.825  < 2e-16 ***
## cities3$Airport         -176.12      69.37  -2.539   0.0115 *  
## cities3$HotelCapacity     49.69      11.02   4.510 8.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8350 on 452 degrees of freedom
## Multiple R-squared:  0.4944, Adjusted R-squared:  0.491 
## F-statistic: 147.3 on 3 and 452 DF,  p-value: < 2.2e-16

Delhi

# Delhi: keep this city's rows, inspect them, report the highest room rent,
# then regress room rent on star rating, airport distance and hotel
# capacity.
cities4 <- subset(cities, CityName == "Delhi")
View(cities4)
max(cities4$RoomRent)
## [1] 45000
Mfit4 <- lm(
  cities4$RoomRent ~
    cities4$StarRating + cities4$Airport + cities4$HotelCapacity
)
summary(Mfit4)
## 
## Call:
## lm(formula = cities4$RoomRent ~ cities4$StarRating + cities4$Airport + 
##     cities4$HotelCapacity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4480  -1115   -396    698  35735 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -6782.7047   336.3556 -20.165  < 2e-16 ***
## cities4$StarRating     2970.9032    98.2369  30.242  < 2e-16 ***
## cities4$Airport          31.7254     8.5299   3.719 0.000205 ***
## cities4$HotelCapacity     0.2545     0.7127   0.357 0.721111    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2460 on 2044 degrees of freedom
## Multiple R-squared:  0.4843, Adjusted R-squared:  0.4835 
## F-statistic: 639.8 on 3 and 2044 DF,  p-value: < 2.2e-16

Manali

# Manali: keep this city's rows, inspect them, report the highest room rent,
# then regress room rent on star rating, airport distance and hotel
# capacity.
cities5 <- subset(cities, CityName == "Manali")
View(cities5)
max(cities5$RoomRent)
## [1] 12134
Mfit5 <- lm(
  cities5$RoomRent ~
    cities5$StarRating + cities5$Airport + cities5$HotelCapacity
)
summary(Mfit5)
## 
## Call:
## lm(formula = cities5$RoomRent ~ cities5$StarRating + cities5$Airport + 
##     cities5$HotelCapacity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4533.6 -1492.6  -474.4  1484.9  7315.0 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2430.915   1137.246   2.138   0.0334 *  
## cities5$StarRating    1655.426    245.705   6.737 8.95e-11 ***
## cities5$Airport        -81.959     17.271  -4.745 3.30e-06 ***
## cities5$HotelCapacity    5.710      5.999   0.952   0.3420    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2183 on 284 degrees of freedom
## Multiple R-squared:  0.2399, Adjusted R-squared:  0.2318 
## F-statistic: 29.87 on 3 and 284 DF,  p-value: < 2.2e-16

Goa

# Goa: keep this city's rows, inspect them, report the highest room rent,
# then regress room rent on star rating, airport distance and hotel
# capacity.
cities6 <- subset(cities, CityName == "Goa")
View(cities6)
max(cities6$RoomRent)
## [1] 41999
Mfit6 <- lm(
  cities6$RoomRent ~
    cities6$StarRating + cities6$Airport + cities6$HotelCapacity
)
summary(Mfit6)
## 
## Call:
## lm(formula = cities6$RoomRent ~ cities6$StarRating + cities6$Airport + 
##     cities6$HotelCapacity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8732  -2963   -911   1564  26945 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -4531.695    966.849  -4.687 3.41e-06 ***
## cities6$StarRating     3572.736    272.225  13.124  < 2e-16 ***
## cities6$Airport          26.351     17.261   1.527    0.127    
## cities6$HotelCapacity     7.759      5.155   1.505    0.133    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4993 on 620 degrees of freedom
## Multiple R-squared:  0.3378, Adjusted R-squared:  0.3346 
## F-statistic: 105.4 on 3 and 620 DF,  p-value: < 2.2e-16

Bangalore

# Bangalore: keep this city's rows, inspect them, report the highest room
# rent, then regress room rent on star rating, airport distance and hotel
# capacity.
cities7 <- subset(cities, CityName == "Bangalore")
View(cities7)
max(cities7$RoomRent)
## [1] 13500
Mfit7 <- lm(
  cities7$RoomRent ~
    cities7$StarRating + cities7$Airport + cities7$HotelCapacity
)
summary(Mfit7)
## 
## Call:
## lm(formula = cities7$RoomRent ~ cities7$StarRating + cities7$Airport + 
##     cities7$HotelCapacity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3015.6 -1015.6  -263.5   716.7  7137.8 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -811.303    573.174  -1.415 0.157412    
## cities7$StarRating    1453.173    135.721  10.707  < 2e-16 ***
## cities7$Airport        -40.013     10.679  -3.747 0.000195 ***
## cities7$HotelCapacity    8.444      1.193   7.076 3.84e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1608 on 652 degrees of freedom
## Multiple R-squared:  0.542,  Adjusted R-squared:  0.5399 
## F-statistic: 257.2 on 3 and 652 DF,  p-value: < 2.2e-16